[llvm] [llvm-mca][AArch64] Add AArch64 version of clearsSuperRegisters. (PR #92548)
Rin Dobrescu via llvm-commits
llvm-commits at lists.llvm.org
Tue May 21 08:11:19 PDT 2024
https://github.com/Rin18 updated https://github.com/llvm/llvm-project/pull/92548
>From 7d13c3afd29035a26588e6c3e06bb348f1971375 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Thu, 16 May 2024 11:26:09 +0000
Subject: [PATCH 1/4] Precommit tests.
---
.../AArch64/Neoverse/V1-clear-upper-regs.s | 792 ++++++++++++++++++
.../AArch64/Neoverse/V2-clear-upper-regs.s | 650 ++++++++++++++
2 files changed, 1442 insertions(+)
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
new file mode 100644
index 0000000000000..197996d229b61
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -0,0 +1,792 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN GPR32-bit
+ldr w0, [sp]
+add x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+fadd d0, d0, d0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add v0.8h, v0.8h, v0.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add v0.4s, v0.4s, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add v0.2d, v0.2d, v0.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ins
+ins v0.b[0], v1.b[1]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN lanewise-load
+ld1 {v0.b}[0], [sp]
+add v0.16b, v0.16b, v0.16b
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - GPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 107
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 1.87
+# CHECK-NEXT: IPC: 1.87
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 4 0.33 * ldr w0, [sp]
+# CHECK-NEXT: 1 1 0.25 add x0, x0, x0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - 0.50 0.50 - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - add x0, x0, x0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER . add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R . ldr w0, [sp]
+# CHECK-NEXT: [1,1] D=====eER . add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE--R . ldr w0, [sp]
+# CHECK-NEXT: [2,1] D======eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeE--R. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=======eER add x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 6.5 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.9 0.6 0.6 <total>
+
+# CHECK: [1] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [2] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [3] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 fadd d0, d0, d0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.8h, v0.8h, v0.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.4s, v0.4s, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [7] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add v0.2d, v0.2d, v0.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [8] Code Region - ins
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 403
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [0,1] D==eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D====eeER . . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [1,1] D======eeER . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D========eeER . . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [2,1] D==========eeER. . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D============eeER . mov v0.b[0], v1.b[1]
+# CHECK-NEXT: [3,1] D==============eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 7.0 0.3 0.0 mov v0.b[0], v1.b[1]
+# CHECK-NEXT: 1. 4 9.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 8.0 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - lanewise-load
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 1003
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 15
+# CHECK-NEXT: uOps Per Cycle: 0.30
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 0.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 8 0.33 * ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v0.16b, v0.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2] - V1UnitL2
+# CHECK-NEXT: [3.0] - V1UnitL01
+# CHECK-NEXT: [3.1] - V1UnitL01
+# CHECK-NEXT: [4] - V1UnitM0
+# CHECK-NEXT: [5] - V1UnitM1
+# CHECK-NEXT: [6.0] - V1UnitS
+# CHECK-NEXT: [6.1] - V1UnitS
+# CHECK-NEXT: [7] - V1UnitV0
+# CHECK-NEXT: [8] - V1UnitV1
+# CHECK-NEXT: [9] - V1UnitV2
+# CHECK-NEXT: [10] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 0.50 0.50
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - - 0.50 - 0.50 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - add v0.16b, v0.16b, v0.16b
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [0,1] D========eeER . . . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp]
+# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 20.0 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
new file mode 100644
index 0000000000000..708f22d54afd3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -0,0 +1,650 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --timeline --timeline-max-iterations=4 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN FPR8-bit
+ldr b0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR16-bit
+ldr h0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR32-bit
+ldr s0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-b
+ld1 {v0.8b}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-h
+ld1 {v0.4h}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-s
+ld1 {v0.2s}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN SIMD64-bit-d
+ld1 {v0.1d}, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN insr
+insr z0.s, w0
+add z0.s, z0.s, z0.s
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - FPR8-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr b0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr b0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [1] Code Region - FPR16-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr h0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr h0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [2] Code Region - FPR32-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr s0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr s0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [3] Code Region - SIMD64-bit-b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.8b }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [4] Code Region - SIMD64-bit-h
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.4h }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.2s }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [6] Code Region - SIMD64-bit-d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ld1 { v0.1d }, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+
+# CHECK: [7] Code Region - insr
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 2 6 1.00 insr z0.s, w0
+# CHECK-NEXT: 1 2 0.25 add z0.s, z0.s, z0.s
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - 0.33 1.00 0.33 0.34
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - 1.00 - - - - - - 1.00 - - insr z0.s, w0
+# CHECK-NEXT: - - - - - - - - - - - - - 0.33 - 0.33 0.34 add z0.s, z0.s, z0.s
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . . . . insr z0.s, w0
+# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
+# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
>From a92c9ec38f97260f9a95486df10eebdbffabc716 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Fri, 17 May 2024 11:10:15 +0000
Subject: [PATCH 2/4] [llvm-mca][AArch64] Add AArch64 version of
clearsSuperRegisters.
---
.../MCTargetDesc/AArch64MCTargetDesc.cpp | 46 ++++
.../AArch64/Neoverse/V1-clear-upper-regs.s | 245 +++++++++---------
.../AArch64/Neoverse/V2-clear-upper-regs.s | 210 +++++++--------
3 files changed, 273 insertions(+), 228 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 043f0a03b7975..d545cfefc319d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -432,6 +432,52 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
return false;
}
+ bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
+ APInt &Mask) const override {
+ const MCInstrDesc &Desc = Info->get(Inst.getOpcode());
+ unsigned NumDefs = Desc.getNumDefs();
+ unsigned NumImplicitDefs = Desc.implicit_defs().size();
+ assert(Mask.getBitWidth() == NumDefs + NumImplicitDefs &&
+ "Unexpected number of bits in the mask!");
+ // 32-bit General Purpose Register class.
+ const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
+ // Floating Point Register classes lower than 128-bit.
+ const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
+ const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
+ const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
+ const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+
+ auto ClearsSuperReg = [=](unsigned RegID) {
+ // An update to the lower 32 bits of a 64 bit integer register is
+ // architecturally defined to zero extend the upper 32 bits on a write.
+ if (GPR32RC.contains(RegID))
+ return true;
+ // SIMD&FP instructions operating on scalar data only acccess the lower
+ // bits of a register, the upper bits are zero extended on a write. For
+ // SIMD vector registers smaller than 128-bits, the upper 64-bits of the
+ // register are zero extended on a write.
+ if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
+ return true;
+ return false;
+ };
+
+ Mask.clearAllBits();
+ for (unsigned I = 0, E = NumDefs; I < E; ++I) {
+ const MCOperand &Op = Inst.getOperand(I);
+ if (ClearsSuperReg(Op.getReg()))
+ Mask.setBit(I);
+ }
+
+ for (unsigned I = 0, E = NumImplicitDefs; I < E; ++I) {
+ const MCPhysReg Reg = Desc.implicit_defs()[I];
+ if (ClearsSuperReg(Reg))
+ Mask.setBit(NumDefs + I);
+ }
+
+ return Mask.getBoolValue();
+ }
+
std::vector<std::pair<uint64_t, uint64_t>>
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
const Triple &TargetTriple) const override {
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
index 197996d229b61..ab81f9fb04af3 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s
@@ -55,12 +55,12 @@ add v0.16b, v0.16b, v0.16b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 107
+# CHECK-NEXT: Total Cycles: 41
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 1.87
-# CHECK-NEXT: IPC: 1.87
+# CHECK-NEXT: uOps Per Cycle: 4.88
+# CHECK-NEXT: IPC: 4.88
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -94,25 +94,24 @@ add v0.16b, v0.16b, v0.16b
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10]
-# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - 0.50 0.50 - - - -
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 0.22 0.22 0.28 0.28 - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions:
# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - ldr w0, [sp]
-# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - add x0, x0, x0
+# CHECK-NEXT: - - - - - - - 0.22 0.22 0.28 0.28 - - - - add x0, x0, x0
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeeeER . ldr w0, [sp]
-# CHECK-NEXT: [0,1] D====eER . add x0, x0, x0
-# CHECK-NEXT: [1,0] DeeeeE-R . ldr w0, [sp]
-# CHECK-NEXT: [1,1] D=====eER . add x0, x0, x0
-# CHECK-NEXT: [2,0] DeeeeE--R . ldr w0, [sp]
-# CHECK-NEXT: [2,1] D======eER. add x0, x0, x0
-# CHECK-NEXT: [3,0] D=eeeeE--R. ldr w0, [sp]
-# CHECK-NEXT: [3,1] D=======eER add x0, x0, x0
+# CHECK: [0,0] DeeeeER . ldr w0, [sp]
+# CHECK-NEXT: [0,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [1,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [1,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [2,0] DeeeeE-R. ldr w0, [sp]
+# CHECK-NEXT: [2,1] D====eER. add x0, x0, x0
+# CHECK-NEXT: [3,0] D=eeeeER. ldr w0, [sp]
+# CHECK-NEXT: [3,1] D=====eER add x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -121,20 +120,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr w0, [sp]
-# CHECK-NEXT: 1. 4 6.5 0.0 0.0 add x0, x0, x0
-# CHECK-NEXT: 4 3.9 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 0.5 ldr w0, [sp]
+# CHECK-NEXT: 1. 4 5.3 0.0 0.0 add x0, x0, x0
+# CHECK-NEXT: 4 3.3 0.6 0.3 <total>
# CHECK: [1] Code Region - FPR8-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -176,17 +175,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -195,20 +194,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [2] Code Region - FPR16-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -250,17 +249,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -269,20 +268,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [3] Code Region - FPR32-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -324,17 +323,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 fadd d0, d0, d0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. fadd d0, d0, d0
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. fadd d0, d0, d0
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D============eeER fadd d0, d0, d0
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. fadd d0, d0, d0
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER fadd d0, d0, d0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -343,20 +342,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [4] Code Region - SIMD64-bit-b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -398,17 +397,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v0.16b, v0.16b
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.16b, v0.16b, v0.16b
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.16b, v0.16b, v0.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -417,20 +416,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.16b, v0.16b, v0.16b
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-h
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -472,17 +471,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.8h, v0.8h, v0.8h
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.8h, v0.8h, v0.8h
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.8h, v0.8h, v0.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -491,20 +490,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.8h, v0.8h, v0.8h
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.8h, v0.8h, v0.8h
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -546,17 +545,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.4s, v0.4s, v0.4s
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.4s, v0.4s, v0.4s
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.4s, v0.4s, v0.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -565,20 +564,20 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [7] Code Region - SIMD64-bit-d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 15
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -620,17 +619,17 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.2d, v0.2d, v0.2d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add v0.2d, v0.2d, v0.2d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add v0.2d, v0.2d, v0.2d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -639,9 +638,9 @@ add v0.16b, v0.16b, v0.16b
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [8] Code Region - ins
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index 708f22d54afd3..20c97eeaa8733 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -45,12 +45,12 @@ add z0.s, z0.s, z0.s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -94,17 +94,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr b0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr b0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr b0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ldr b0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -113,20 +113,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [1] Code Region - FPR16-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -170,17 +170,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr h0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr h0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr h0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ldr h0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -189,20 +189,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [2] Code Region - FPR32-bit
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -246,17 +246,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ldr s0, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ldr s0, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ldr s0, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ldr s0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -265,20 +265,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [3] Code Region - SIMD64-bit-b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -322,17 +322,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -341,20 +341,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [4] Code Region - SIMD64-bit-h
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -398,17 +398,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -417,20 +417,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -474,17 +474,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -493,20 +493,20 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
-# CHECK-NEXT: Total Cycles: 209
+# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 0.96
-# CHECK-NEXT: IPC: 0.96
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
# CHECK: Instruction Info:
@@ -550,17 +550,17 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [0,1] D======eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,0] DeeeeeeE--R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [1,1] D========eeER .. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [2,0] DeeeeeeE----R .. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [2,1] D==========eeER.. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-----R.. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D============eeER add z0.d, z0.d, z0.d
+# CHECK: [0,0] DeeeeeeER .. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -569,9 +569,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 2.8 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 10.0 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 5.6 0.6 1.4 <total>
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
# CHECK: [7] Code Region - insr
>From b2abd342d8128be4dfa022ade56eca832fe4d26d Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Mon, 20 May 2024 10:41:04 +0000
Subject: [PATCH 3/4] Address comment
---
.../lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index d545cfefc319d..3247fe03bb488 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -456,10 +456,8 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
// bits of a register, the upper bits are zero extended on a write. For
// SIMD vector registers smaller than 128-bits, the upper 64-bits of the
// register are zero extended on a write.
- if (FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
- FPR32RC.contains(RegID) || FPR64RC.contains(RegID))
- return true;
- return false;
+ return FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID);
};
Mask.clearAllBits();
>From 42a2e58e648d17372b2fd27df06ed0d99bc73127 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Tue, 21 May 2024 15:08:10 +0000
Subject: [PATCH 4/4] Add FPR-128 implementation and tests for FPR-64 and
FPR-128
---
.../MCTargetDesc/AArch64MCTargetDesc.cpp | 9 +-
.../AArch64/Neoverse/V2-clear-upper-regs.s | 172 +++++++++++++++++-
2 files changed, 174 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 3247fe03bb488..7ed43031841cc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -441,11 +441,13 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
"Unexpected number of bits in the mask!");
// 32-bit General Purpose Register class.
const MCRegisterClass &GPR32RC = MRI.getRegClass(AArch64::GPR32RegClassID);
- // Floating Point Register classes lower than 128-bit.
+ // Floating Point Register classes.
const MCRegisterClass &FPR8RC = MRI.getRegClass(AArch64::FPR8RegClassID);
const MCRegisterClass &FPR16RC = MRI.getRegClass(AArch64::FPR16RegClassID);
const MCRegisterClass &FPR32RC = MRI.getRegClass(AArch64::FPR32RegClassID);
const MCRegisterClass &FPR64RC = MRI.getRegClass(AArch64::FPR64RegClassID);
+ const MCRegisterClass &FPR128RC =
+ MRI.getRegClass(AArch64::FPR128RegClassID);
auto ClearsSuperReg = [=](unsigned RegID) {
// An update to the lower 32 bits of a 64 bit integer register is
@@ -456,8 +458,11 @@ class AArch64MCInstrAnalysis : public MCInstrAnalysis {
// bits of a register, the upper bits are zero extended on a write. For
// SIMD vector registers smaller than 128-bits, the upper 64-bits of the
// register are zero extended on a write.
+ // When VL is higher than 128 bits, any write to a SIMD&FP register sets
+ // bits higher than 128 to zero.
return FPR8RC.contains(RegID) || FPR16RC.contains(RegID) ||
- FPR32RC.contains(RegID) || FPR64RC.contains(RegID);
+ FPR32RC.contains(RegID) || FPR64RC.contains(RegID) ||
+ FPR128RC.contains(RegID);
};
Mask.clearAllBits();
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index 20c97eeaa8733..fd2083dc1277a 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -16,6 +16,16 @@ ldr s0, [sp]
add z0.d, z0.d, z0.d
# LLVM-MCA-END
+# LLVM-MCA-BEGIN FPR64-bit
+ldr d0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN FPR128-bit
+ldr q0, [sp]
+add z0.d, z0.d, z0.d
+# LLVM-MCA-END
+
# LLVM-MCA-BEGIN SIMD64-bit-b
ld1 {v0.8b}, [sp]
add z0.d, z0.d, z0.d
@@ -269,7 +279,159 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
-# CHECK: [3] Code Region - SIMD64-bit-b
+# CHECK: [3] Code Region - FPR64-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr d0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr d0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr d0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [4] Code Region - FPR128-bit
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 44
+# CHECK-NEXT: Total uOps: 200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 4.55
+# CHECK-NEXT: Block RThroughput: 0.3
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 6 0.33 * ldr q0, [sp]
+# CHECK-NEXT: 1 2 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2] - V2UnitL2
+# CHECK-NEXT: [3.0] - V2UnitL01
+# CHECK-NEXT: [3.1] - V2UnitL01
+# CHECK-NEXT: [4] - V2UnitM0
+# CHECK-NEXT: [5] - V2UnitM1
+# CHECK-NEXT: [6] - V2UnitS0
+# CHECK-NEXT: [7] - V2UnitS1
+# CHECK-NEXT: [8] - V2UnitS2
+# CHECK-NEXT: [9] - V2UnitS3
+# CHECK-NEXT: [10] - V2UnitV0
+# CHECK-NEXT: [11] - V2UnitV1
+# CHECK-NEXT: [12] - V2UnitV2
+# CHECK-NEXT: [13] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - 0.33 0.33 0.34 - - - - - - - - - - ldr q0, [sp]
+# CHECK-NEXT: - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add z0.d, z0.d, z0.d
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldr q0, [sp]
+# CHECK-NEXT: [0,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
+# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+
+# CHECK: [5] Code Region - SIMD64-bit-b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
@@ -345,7 +507,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
-# CHECK: [4] Code Region - SIMD64-bit-h
+# CHECK: [6] Code Region - SIMD64-bit-h
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
@@ -421,7 +583,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
-# CHECK: [5] Code Region - SIMD64-bit-s
+# CHECK: [7] Code Region - SIMD64-bit-s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
@@ -497,7 +659,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
-# CHECK: [6] Code Region - SIMD64-bit-d
+# CHECK: [8] Code Region - SIMD64-bit-d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
@@ -573,7 +735,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
-# CHECK: [7] Code Region - insr
+# CHECK: [9] Code Region - insr
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 200
More information about the llvm-commits
mailing list