[llvm] [AArch64] Add SchedReadAdvance to the Neoverse-N3 scheduling model (PR #167302)
Asher Dobrescu via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 10 03:34:49 PST 2025
https://github.com/Asher8118 created https://github.com/llvm/llvm-project/pull/167302
Introduce a description of late forwarding to the Neoverse-N3 scheduling model.
From 461c54cb6bd49d84a27feee73eae394e24b9b6b7 Mon Sep 17 00:00:00 2001
From: Ash Dobrescu <ash.dobrescu at arm.com>
Date: Fri, 7 Nov 2025 15:13:51 +0000
Subject: [PATCH 1/2] [AArch64] Add SchedReadAdvance to Neoverse-N3 scheduling model
Introduce a description of late forwarding to the Neoverse-N3 scheduling model.
---
.../llvm-mca/AArch64/Neoverse/N3-forwarding.s | 2034 +++++++++++++++++
1 file changed, 2034 insertions(+)
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
new file mode 100644
index 0000000000000..59e3af8abd708
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
@@ -0,0 +1,2034 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n3 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sdot
+mul v0.4s, v0.4s, v0.4s
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smmla
+mul v0.4s, v0.4s, v0.4s
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqrdmlah
+mul v0.4s, v0.4s, v0.4s
+sqrdmlah v0.8h, v1.8h, v2.8h
+sqrdmlah v0.8h, v1.8h, v2.8h
+sqrdmlah v0.8h, v1.8h, v2.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqdmlal2
+mul v0.4s, v0.4s, v0.4s
+sqdmlal2 v0.4s, v1.8h, v2.8h
+sqdmlal2 v0.4s, v1.8h, v2.8h
+sqdmlal2 v0.4s, v1.8h, v2.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fcmla
+fmul v0.4s, v0.4s, v0.4s
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v0.2d, v1.2d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmlal
+fmul v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fadd v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfdot
+fmul v0.2d, v0.2d, v0.2d
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmmla
+fmul v0.2d, v0.2d, v0.2d
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul v0.2d, v0.2d, v0.2d
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32
+mul w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+crc32b w0, w0, w15
+crc32h w0, w0, w21
+crc32w w0, w0, w24
+crc32x w0, w0, x25
+crc32ch w0, w0, w16
+crc32cw w0, w0, w23
+crc32cx w0, w0, x5
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z saba
+mul z0.d, z0.d, z0.d
+saba z0.d, z1.d, z2.d
+saba z0.d, z1.d, z2.d
+saba z0.d, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sabalt
+mul z0.h, z0.h, z0.h
+sabalt z0.h, z1.b, z2.b
+sabalt z0.h, z1.b, z2.b
+sabalt z0.h, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sadalp
+mul z0.d, z0.d, z0.d
+sadalp z0.d, p0/m, z1.s
+sadalp z0.d, p0/m, z1.s
+sadalp z0.d, p0/m, z0.s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z ssra
+mul z0.d, z0.d, z0.d
+ssra z0.d, z1.d, #1
+ssra z0.d, z1.d, #1
+ssra z0.d, z0.d, #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cdot.s
+mul z0.d, z0.d, z0.d
+cdot z0.s, z1.b, z2.b, #90
+cdot z0.s, z1.b, z2.b, #90
+cdot z0.s, z0.b, z1.b, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cdot.d
+mul z0.d, z0.d, z0.d
+cdot z0.d, z1.h, z2.h, #90
+cdot z0.d, z1.h, z2.h, #90
+cdot z0.d, z0.h, z1.h, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cmla.b
+mul z0.d, z0.d, z0.d
+cmla z0.b, z1.b, z2.b, #90
+cmla z0.b, z1.b, z2.b, #90
+cmla z0.b, z0.b, z1.b, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cmla.d
+mul z0.d, z0.d, z0.d
+cmla z0.d, z1.d, z2.d, #90
+cmla z0.d, z1.d, z2.d, #90
+cmla z0.d, z0.d, z1.d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.s
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sudot
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z0.b, z1.b[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.d
+mul z0.d, z0.d, z0.d
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smmla
+mul z0.s, z0.s, z0.s
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.b
+mul z0.d, z0.d, z0.d
+mla z0.b, p0/m, z1.b, z2.b
+mla z0.b, p0/m, z1.b, z2.b
+mla z0.b, p0/m, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.d
+mul z0.d, z0.d, z0.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smlalb
+mul z0.d, z0.d, z0.d
+smlalb z0.d, z1.s, z2.s
+smlalb z0.d, z1.s, z2.s
+smlalb z0.d, z0.s, z1.s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sqdmlalb
+mul z0.d, z0.d, z0.d
+sqdmlalb z0.d, z1.s, z2.s
+sqdmlalb z0.d, z1.s, z2.s
+sqdmlalb z0.d, z0.s, z1.s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sqrdmlah.b
+mul z0.d, z0.d, z0.d
+sqrdmlah z0.b, z1.b, z2.b
+sqrdmlah z0.b, z1.b, z2.b
+sqrdmlah z0.b, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sqrdmlah.d
+mul z0.d, z0.d, z0.d
+sqrdmlah z0.d, z1.d, z2.d
+sqrdmlah z0.d, z1.d, z2.d
+sqrdmlah z0.d, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z0.d, z1.d, 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZZZI
+fmul z0.d, z0.d, z0.d
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z0.s, z1.s[1], 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fmla z0.d, p0/m, z1.d, z2.d
+fmla z0.d, p0/m, z1.d, z2.d
+fmla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZZZI
+fmul z0.d, z0.d, z0.d
+fmla z0.d, z1.d, z2.d[1]
+fmla z0.d, z1.d, z2.d[1]
+fmla z0.d, z0.d, z1.d[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmlalb ZZZ
+fmul z0.d, z0.d, z0.d
+fmlalb z0.s, z1.h, z2.h
+fmlalb z0.s, z1.h, z2.h
+fmlalb z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z bfdot
+fmul z0.d, z0.d, z0.d
+bfdot z0.s, z1.h, z2.h
+bfdot z0.s, z1.h, z2.h
+bfdot z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z bfmmla
+fmul z0.d, z0.d, z0.d
+bfmmla z0.s, z1.h, z2.h
+bfmmla z0.s, z1.h, z2.h
+bfmmla z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul z0.d, z0.d, z0.d
+bfmlalb z0.s, z1.h, z2.h
+bfmlalb z0.s, z1.h, z2.h
+bfmlalb z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - madd
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.57
+# CHECK-NEXT: IPC: 0.57
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . .. mul x0, x0, x0
+# CHECK-NEXT: [0,1] D==eeER . .. madd x0, x1, x2, x0
+# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0
+# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0
+# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
+# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+
+# CHECK: [1] Code Region - smaddl
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.57
+# CHECK-NEXT: IPC: 0.57
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . .. mul x0, x0, x0
+# CHECK-NEXT: [0,1] D==eeER . .. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
+# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+
+# CHECK: [2] Code Region - fmadd
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeER. . . . . . . . . fadd d0, d0, d0
+# CHECK-NEXT: [0,1] D==eeeeER . . . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,2] D======eeeER . . . . . . . fmul d0, d0, d0
+# CHECK-NEXT: [0,3] D=========eeeeER . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,4] D=============eeeeER. . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,5] D=================eeeeER . . . . . fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] D=====================eeER . . . . fadd d0, d0, d0
+# CHECK-NEXT: [1,1] D=======================eeeeER. . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] D===========================eeeER . . . fmul d0, d0, d0
+# CHECK-NEXT: [1,3] D==============================eeeeER . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,4] .D=================================eeeeER . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] .D=====================================eeeeER fmadd d0, d0, d1, d2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 3. 2 20.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 28.0 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 19.2 0.1 0.0 <total>
+
+# CHECK: [3] Code Region - saba
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] D============eeeeER . . . . saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D====================eeeeER . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] D========================eeeeER . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] D============================eeeeER saba v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [4] Code Region - sdot
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1303
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeER. . . . . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2] D=======eeeER . . . . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3] D==========eeeER . . . sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D=================eeeER . . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2] D====================eeeER . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] D=======================eeeER sdot v0.4s, v0.16b, v1.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
+
+# CHECK: [5] Code Region - smmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1303
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeER. . . . . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2] D=======eeeER . . . . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3] D==========eeeER . . . smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D=================eeeER . . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2] D====================eeeER . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] D=======================eeeER smmla v0.4s, v0.16b, v1.16b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
+
+# CHECK: [6] Code Region - mla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] D============eeeeER . . . . mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D====================eeeeER . . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] D========================eeeeER . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] D============================eeeeER mla v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [7] Code Region - sqrdmlah
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D============eeeeER . . . . sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D====================eeeeER . . sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] D========================eeeeER . sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] D============================eeeeER sqrdmlah v0.8h, v1.8h, v2.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [8] Code Region - smlal2
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D============eeeeER . . . . smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D====================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] D========================eeeeER . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] D============================eeeeER smlal2 v0.4s, v0.8h, v1.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - sqdmlal2
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D============eeeeER . . . . sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D====================eeeeER . . sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] D========================eeeeER . sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] D============================eeeeER sqdmlal2 v0.4s, v1.8h, v2.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [10] Code Region - sadalp
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,3] D============eeeeER . . . . sadalp v0.2d, v0.4s
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D====================eeeeER . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,2] D========================eeeeER . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,3] D============================eeeeER sadalp v0.2d, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [11] Code Region - fcmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] D==================eeeeER. . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,2] D======================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3] D==========================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [12] Code Region - fmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeeER . . . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2] D=======eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3] D=========eeeeER . . . . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4] D=============eeeeER. . . . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5] D=================eeeeER . . . . . fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] D=====================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] D========================eeeeER . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] D============================eeER . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] D==============================eeeeER . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4] .D=================================eeeeER . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] .D=====================================eeeeER fmla v0.2d, v0.2d, v1.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 20.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 28.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 19.5 0.1 0.0 <total>
+
+# CHECK: [13] Code Region - fmlal
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeeER . . . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2] D=======eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3] D=========eeeeER . . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,4] D=============eeeeER. . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5] D=================eeeeER . . . . . fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] D=====================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] D========================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] D============================eeER . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] D==============================eeeeER . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4] .D=================================eeeeER . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] .D=====================================eeeeER fmlal v0.4s, v0.4h, v1.4h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 20.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 28.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 19.5 0.1 0.0 <total>
+
+# CHECK: [14] Code Region - bfdot
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] D======================eeeeER . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] D==========================eeeeER bfdot v0.4s, v0.8h, v1.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [15] Code Region - bfmmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.22
+# CHECK-NEXT: IPC: 0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D========eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D=============eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D==================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] D=====================eeeeeER . . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] D==========================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] D===============================eeeeeER bfmmla v0.4s, v0.8h, v1.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 18.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
+
+# CHECK: [16] Code Region - bfmlalb
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] D======================eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] D==========================eeeeER bfmlalb v0.4s, v0.8h, v1.8h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [17] Code Region - crc32
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 1100
+# CHECK-NEXT: Total Cycles: 2203
+# CHECK-NEXT: Total uOps: 1100
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0123456
+
+# CHECK: [0,0] DeeER. . . . . . . . .. mul w0, w0, w0
+# CHECK-NEXT: [0,1] D==eeER . . . . . . . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [0,2] D====eeER . . . . . . . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [0,3] D======eeER . . . . . . .. crc32cb w0, w0, w0
+# CHECK-NEXT: [0,4] D========eeER . . . . . . .. crc32b w0, w0, w15
+# CHECK-NEXT: [0,5] D==========eeER. . . . . . .. crc32h w0, w0, w21
+# CHECK-NEXT: [0,6] D============eeER . . . . . .. crc32w w0, w0, w24
+# CHECK-NEXT: [0,7] D==============eeER . . . . . .. crc32x w0, w0, x25
+# CHECK-NEXT: [0,8] D================eeER . . . . .. crc32ch w0, w0, w16
+# CHECK-NEXT: [0,9] D==================eeER . . . . .. crc32cw w0, w0, w23
+# CHECK-NEXT: [0,10] .D===================eeER. . . . .. crc32cx w0, w0, x5
+# CHECK-NEXT: [1,0] .D=====================eeER . . . .. mul w0, w0, w0
+# CHECK-NEXT: [1,1] .D=======================eeER . . . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [1,2] .D=========================eeER . . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [1,3] .D===========================eeER . . .. crc32cb w0, w0, w0
+# CHECK-NEXT: [1,4] .D=============================eeER. . .. crc32b w0, w0, w15
+# CHECK-NEXT: [1,5] .D===============================eeER . .. crc32h w0, w0, w21
+# CHECK-NEXT: [1,6] .D=================================eeER . .. crc32w w0, w0, w24
+# CHECK-NEXT: [1,7] .D===================================eeER .. crc32x w0, w0, x25
+# CHECK-NEXT: [1,8] .D=====================================eeER .. crc32ch w0, w0, w16
+# CHECK-NEXT: [1,9] . D======================================eeER.. crc32cw w0, w0, w23
+# CHECK-NEXT: [1,10] . D========================================eeER crc32cx w0, w0, x5
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.5 0.5 0.0 mul w0, w0, w0
+# CHECK-NEXT: 1. 2 13.5 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 crc32cb w0, w0, w0
+# CHECK-NEXT: 4. 2 19.5 0.0 0.0 crc32b w0, w0, w15
+# CHECK-NEXT: 5. 2 21.5 0.0 0.0 crc32h w0, w0, w21
+# CHECK-NEXT: 6. 2 23.5 0.0 0.0 crc32w w0, w0, w24
+# CHECK-NEXT: 7. 2 25.5 0.0 0.0 crc32x w0, w0, x25
+# CHECK-NEXT: 8. 2 27.5 0.0 0.0 crc32ch w0, w0, w16
+# CHECK-NEXT: 9. 2 29.0 0.0 0.0 crc32cw w0, w0, w23
+# CHECK-NEXT: 10. 2 30.5 0.0 0.0 crc32cx w0, w0, x5
+# CHECK-NEXT: 2 21.4 0.0 0.0 <total>
+
+# CHECK: [18] Code Region - Z saba
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. saba z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] D==========================eeeeER .. saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] D==============================eeeeER saba z0.d, z0.d, z1.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 saba z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [19] Code Region - Z sabalt
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 01234
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . . mul z0.h, z0.h, z0.h
+# CHECK-NEXT: [0,1] D====eeeeER . . . . . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,2] D========eeeeER. . . . . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,3] D============eeeeER . . . . sabalt z0.h, z0.b, z1.b
+# CHECK-NEXT: [1,0] D================eeeeER . . . mul z0.h, z0.h, z0.h
+# CHECK-NEXT: [1,1] D====================eeeeER . . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,2] D========================eeeeER . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,3] D============================eeeeER sabalt z0.h, z0.b, z1.b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul z0.h, z0.h, z0.h
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sabalt z0.h, z0.b, z1.b
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [20] Code Region - Z sadalp
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2] D==========================eeeeER .. sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3] D==============================eeeeER sadalp z0.d, p0/m, z0.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [21] Code Region - Z ssra
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. ssra z0.d, z1.d, #1
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. ssra z0.d, z1.d, #1
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. ssra z0.d, z0.d, #1
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,2] D==========================eeeeER .. ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,3] D==============================eeeeER ssra z0.d, z0.d, #1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 ssra z0.d, z0.d, #1
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [22] Code Region - Z cdot.s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeER . . . . cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2] D========eeeER . . . . cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3] D===========eeeER . . . cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D===================eeeER. . cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] D======================eeeER . cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] D=========================eeeER cdot z0.s, z0.b, z1.b, #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 16.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+
+# CHECK: [23] Code Region - Z cdot.d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2] D==========================eeeeER .. cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3] D==============================eeeeER cdot z0.d, z0.h, z1.h, #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [24] Code Region - Z cmla.b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] D==========================eeeeER .. cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] D==============================eeeeER cmla z0.b, z0.b, z1.b, #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [25] Code Region - Z cmla.d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 2003
+# CHECK-NEXT: Total uOps: 800
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.40
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeER . . . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2] D==========eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3] D===============eeeeeER . . . . . cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0] D====================eeeeeER . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D========================eeeeeER . . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2] .D=============================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D==================================eeeeeER cmla z0.d, z0.d, z1.d, #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 15.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2. 2 20.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 25.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: 2 18.1 0.1 0.0 <total>
+
+# CHECK: [26] Code Region - Z sdot.s
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeER . . . . sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2] D========eeeER . . . . sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3] D===========eeeER . . . sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1] D===================eeeER. . sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2] D======================eeeER . sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] D=========================eeeER sdot z0.s, z0.b, z1.b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 2. 2 16.0 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+
+# CHECK: [27] Code Region - Z sudot
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeER . . . . sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,2] D========eeeER . . . . sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,3] D===========eeeER . . . sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1] D===================eeeER. . sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2] D======================eeeER . sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3] D=========================eeeER sdot z0.s, z0.b, z1.b[1]
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2. 2 16.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+
+# CHECK: [28] Code Region - Z sdot.d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2] D==========================eeeeER .. sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3] D==============================eeeeER sdot z0.d, z0.h, z1.h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [29] Code Region - Z smmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1303
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeER . . . . . mul z0.s, z0.s, z0.s
+# CHECK-NEXT: [0,1] D====eeeER. . . . . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2] D=======eeeER . . . . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3] D==========eeeER . . . smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,1] D=================eeeER . . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2] D====================eeeER . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] D=======================eeeER smmla z0.s, z0.b, z1.b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.s, z0.s, z0.s
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
+
+# CHECK: [30] Code Region - Z mla.b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2] D==========================eeeeER .. mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3] D==============================eeeeER mla z0.b, p0/m, z0.b, z1.b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [31] Code Region - Z mla.d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 2003
+# CHECK-NEXT: Total uOps: 800
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.40
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeER . . . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2] D==========eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3] D===============eeeeeER . . . . . mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0] D====================eeeeeER . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D========================eeeeeER . . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=============================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==================================eeeeeER mla z0.d, p0/m, z0.d, z1.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 15.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2. 2 20.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 25.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 18.1 0.1 0.0 <total>
+
+# CHECK: [32] Code Region - Z smlalb
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] D==========================eeeeER .. smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] D==============================eeeeER smlalb z0.d, z0.s, z1.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [33] Code Region - Z sqdmlalb
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] D==========================eeeeER .. sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] D==============================eeeeER sqdmlalb z0.d, z0.s, z1.s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [34] Code Region - Z sqrdmlah.b
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D======================eeeeER . .. sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2] D==========================eeeeER .. sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3] D==============================eeeeER sqrdmlah z0.b, z0.b, z1.b
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+
+# CHECK: [35] Code Region - Z sqrdmlah.d
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 2003
+# CHECK-NEXT: Total uOps: 800
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.40
+# CHECK-NEXT: IPC: 0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeER . . . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2] D==========eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3] D===============eeeeeER . . . . . sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] D====================eeeeeER . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D========================eeeeeER . . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=============================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==================================eeeeeER sqrdmlah z0.d, z0.d, z1.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 15.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 20.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 25.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 18.1 0.1 0.0 <total>
+
+# CHECK: [36] Code Region - Z fcmla ZPmZZ
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2] D======================eeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] D==========================eeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [37] Code Region - Z fcmla ZZZI
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,2] D======================eeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3] D==========================eeeeER fcmla z0.s, z0.s, z1.s[1], #90
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [38] Code Region - Z fmla ZPmZZ
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2] D======================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] D==========================eeeeER fmla z0.d, p0/m, z0.d, z1.d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [39] Code Region - Z fmla ZZZI
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,2] D======================eeeeER . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3] D==========================eeeeER fmla z0.d, z0.d, z1.d[1]
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [40] Code Region - Z fmlalb ZZZ
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] D======================eeeeER . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] D==========================eeeeER fmlalb z0.s, z0.h, z1.h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [41] Code Region - Z bfdot
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] D======================eeeeER . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] D==========================eeeeER bfdot z0.s, z0.h, z1.h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+
+# CHECK: [42] Code Region - Z bfmmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.22
+# CHECK-NEXT: IPC: 0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D========eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D=============eeeeeER . . . . bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D==================eeeER . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D=====================eeeeeER . . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] D==========================eeeeeER . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] D===============================eeeeeER bfmmla z0.s, z0.h, z1.h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 18.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
+
+# CHECK: [43] Code Region - bfmlalb
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 10
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] D==================eeeeER. . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] D======================eeeeER . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] D==========================eeeeER bfmlalb z0.s, z0.h, z1.h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
>From cfa2112ff0dca73bfa6a37b5b6234f38674399b9 Mon Sep 17 00:00:00 2001
From: Ash Dobrescu <ash.dobrescu at arm.com>
Date: Fri, 7 Nov 2025 16:28:37 +0000
Subject: [PATCH 2/2] [AArch64] Add late forwarding to Neoverse-N3 and update test
---
.../Target/AArch64/AArch64SchedNeoverseN3.td | 205 +-
.../llvm-mca/AArch64/Neoverse/N3-forwarding.s | 1688 ++++++++---------
2 files changed, 1003 insertions(+), 890 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index 411b372a3f533..d3705b932bf62 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -553,6 +553,107 @@ def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0, N3UnitV0, N3UnitV0, N3UnitV0,
let NumMicroOps = 16;
}
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+// NOTE: SOG, p. 19, n. 2: Accumulator forwarding may not be supported for
+// consumers of 64-bit multiply-high operations (TODO: confirm against SOG).
+
+def N3Wr_FMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FMA : SchedReadAdvance<2, [WriteFMul, N3Wr_FMA]>;
+
+def N3Wr_VMA : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMA : SchedReadAdvance<3, [N3Wr_VMA]>;
+
+def N3Wr_VMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMAL : SchedReadAdvance<3, [N3Wr_VMAL]>;
+
+def N3Wr_VMAH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMAH : SchedReadAdvance<2, [N3Wr_VMAH]>;
+
+def N3Wr_VMASL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMASL : SchedReadAdvance<2, [N3Wr_VMASL]>;
+
+def N3Wr_ADA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ADA : SchedReadAdvance<3, [N3Wr_ADA]>;
+
+def N3Wr_VDOT : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_VDOT : SchedReadAdvance<2, [N3Wr_VDOT]>;
+
+def N3Wr_VMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_VMMA : SchedReadAdvance<2, [N3Wr_VMMA]>;
+
+def N3Wr_FCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FCMA : SchedReadAdvance<2, [N3Wr_FCMA]>;
+
+def N3Wr_FPM : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Wr_FPMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FPMA : SchedReadAdvance<2, [N3Wr_FPM, N3Wr_FPMA]>;
+
+def N3Wr_FPMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FPMAL : SchedReadAdvance<2, [N3Wr_FPMAL]>;
+
+def N3Wr_BFD : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_BFD : SchedReadAdvance<2, [N3Wr_BFD]>;
+
+def N3Wr_BFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
+def N3Rd_BFMMA : SchedReadAdvance<2, [N3Wr_BFMMA]>;
+
+def N3Wr_BFMLA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_BFMLA : SchedReadAdvance<2, [N3Wr_BFMLA]>;
+
+def N3Wr_CRC : SchedWriteRes<[N3UnitM0]> { let Latency = 2; }
+def N3Rd_CRC : SchedReadAdvance<1, [N3Wr_CRC]>;
+
+def N3Wr_ZA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZA : SchedReadAdvance<3, [N3Wr_ZA]>;
+def N3Wr_ZPA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZPA : SchedReadAdvance<3, [N3Wr_ZPA]>;
+def N3Wr_ZSA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZSA : SchedReadAdvance<3, [N3Wr_ZSA]>;
+
+def N3Wr_ZDOTB : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_ZDOTB : SchedReadAdvance<2, [N3Wr_ZDOTB]>;
+def N3Wr_ZDOTH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZDOTH : SchedReadAdvance<3, [N3Wr_ZDOTH]>;
+
+def N3Wr_ZCMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZCMABHS : SchedReadAdvance<3, [N3Wr_ZCMABHS]>;
+def N3Wr_ZCMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZCMAD : SchedReadAdvance<2, [N3Wr_ZCMAD]>;
+
+def N3Wr_ZMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_ZMMA : SchedReadAdvance<2, [N3Wr_ZMMA]>;
+
+def N3Wr_ZMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZMABHS : SchedReadAdvance<3, [N3Wr_ZMABHS]>;
+def N3Wr_ZMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZMAD : SchedReadAdvance<2, [N3Wr_ZMAD]>;
+
+def N3Wr_ZMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZMAL : SchedReadAdvance<3, [N3Wr_ZMAL]>;
+
+def N3Wr_ZMASQL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Wr_ZMASQBHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Wr_ZMASQD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZMASQ : SchedReadAdvance<2, [N3Wr_ZMASQL, N3Wr_ZMASQBHS,
+ N3Wr_ZMASQD]>;
+
+def N3Wr_ZFCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFCMA : SchedReadAdvance<2, [N3Wr_ZFCMA]>;
+
+def N3Wr_ZFMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFMA : SchedReadAdvance<2, [N3Wr_ZFMA]>;
+
+def N3Wr_ZFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFMAL : SchedReadAdvance<2, [N3Wr_ZFMAL]>;
+
+def N3Wr_ZBFDOT : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZBFDOT : SchedReadAdvance<2, [N3Wr_ZBFDOT]>;
+def N3Wr_ZBFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
+def N3Rd_ZBFMMA : SchedReadAdvance<2, [N3Wr_ZBFMMA]>;
+def N3Wr_ZBFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZBFMAL : SchedReadAdvance<2, [N3Wr_ZBFMAL]>;
+
// Miscellaneous
// -----------------------------------------------------------------------------
@@ -832,10 +933,11 @@ def : SchedAlias<WriteFDiv , N3Write_7c_1V0>;
def : InstRW<[N3Write_12c_1V0], (instrs FDIVDrr, FSQRTDr)>;
// FP multiply
-def : SchedAlias<WriteFMul, N3Write_3c_1V>;
+def : WriteRes<WriteFMul, [N3UnitV]> { let Latency = 3; }
// FP multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
+def : InstRW<[N3Wr_FMA, ReadDefault, ReadDefault, N3Rd_FMA],
+ (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
// FP round to integral
def : InstRW<[N3Write_3c_1V0], (instregex "^FRINT([AIMNPXZ]|32X|64X|32Z|64Z)[DHS]r$")>;
@@ -969,7 +1071,7 @@ def : SchedAlias<WriteVq, N3Write_2c_1V>;
// ASIMD absolute diff accum long
// ASIMD pairwise add and accumulate long
// ASIMD shift accumulate
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL?v",
+def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v",
"^[SU]ADALPv",
"^[SU]R?SRAv")>;
@@ -984,10 +1086,11 @@ def : InstRW<[N3Write_6c_2V1], (instregex "^[SU]?ADDL?Vv16i8v$")>;
// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
-def : InstRW<[N3Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+def : InstRW<[N3Wr_VDOT, N3Rd_VDOT],
+ (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
// ASIMD matrix multiply-accumulate
-def : InstRW<[N3Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+def : InstRW<[N3Wr_VMMA, N3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD max/min, reduce, 4H/4S
def : InstRW<[N3Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i(16|32)v$")>;
@@ -1002,16 +1105,16 @@ def : InstRW<[N3Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
def : InstRW<[N3Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
// ASIMD multiply accumulate
-def : InstRW<[N3Write_4c_1V0], (instregex "^MLAv", "^MLSv")>;
+def : InstRW<[N3Wr_VMA, N3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
// ASIMD multiply accumulate high
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
// ASIMD multiply accumulate long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
// ASIMD multiply accumulate saturating long
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>;
+def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>;
// ASIMD multiply/multiply long (8x8) polynomial, D-form
// ASIMD multiply/multiply long (8x8) polynomial, Q-form
@@ -1058,7 +1161,7 @@ def : InstRW<[N3Write_4c_1V1],
def : InstRW<[N3Write_3c_1V], (instregex "^FCADDv")>;
// ASIMD FP complex multiply add
-def : InstRW<[N3Write_4c_1V], (instregex "^FCMLAv")>;
+def : InstRW<[N3Wr_FCMA, N3Rd_FCMA], (instregex "^FCMLAv")>;
// ASIMD FP convert, long (F16 to F32)
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
@@ -1114,13 +1217,13 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
// ASIMD FP multiply
-def : InstRW<[N3Write_3c_1V], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>;
// ASIMD FP multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>;
// ASIMD FP multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^FMLALv", "^FMLSLv")>;
+def : InstRW<[N3Wr_FPMAL, N3Rd_FPMAL], (instregex "^FMLALv", "^FMLSLv")>;
// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[N3Write_3c_1V0],
@@ -1157,13 +1260,14 @@ def : InstRW<[N3Write_13c_2V0], (instrs FSQRTv2f64)>;
def : InstRW<[N3Write_4c_2V0], (instrs BFCVTN, BFCVTN2)>;
// ASIMD dot product
-def : InstRW<[N3Write_4c_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+def : InstRW<[N3Wr_BFD, N3Rd_BFD], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
// ASIMD matrix multiply accumulate
-def : InstRW<[N3Write_5c_1V], (instrs BFMMLA)>;
+def : InstRW<[N3Wr_BFMMA, N3Rd_BFMMA], (instrs BFMMLA)>;
// ASIMD multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
+def : InstRW<[N3Wr_BFMLA, N3Rd_BFMLA],
+ (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
// Scalar convert, F32 to BF16
def : InstRW<[N3Write_3c_1V0], (instrs BFCVT)>;
@@ -1502,7 +1606,7 @@ def : InstRW<[N3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
// -----------------------------------------------------------------------------
// CRC checksum ops
-def : InstRW<[N3Write_2c_1M0], (instregex "^CRC32")>;
+def : InstRW<[N3Wr_CRC, N3Rd_CRC], (instregex "^CRC32")>;
// SVE Predicate instructions
// -----------------------------------------------------------------------------
@@ -1592,10 +1696,10 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
"^[SU]ABD_ZPZZ_[BHSD]")>;
// Arithmetic, absolute diff accum
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
+def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
// Arithmetic, absolute diff accum long
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
+def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
// Arithmetic, absolute diff long
def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>;
@@ -1629,7 +1733,8 @@ def : InstRW<[N3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>;
def : InstRW<[N3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>;
// Arithmetic, pairwise add and accum long
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
+def : InstRW<[N3Wr_ZPA, ReadDefault, N3Rd_ZPA],
+ (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
// Arithmetic, shift
def : InstRW<[N3Write_2c_1V1],
@@ -1642,7 +1747,7 @@ def : InstRW<[N3Write_2c_1V1],
"^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
// Arithmetic, shift and accumulate
-def : InstRW<[N3Write_4c_1V1],
+def : InstRW<[N3Wr_ZSA, N3Rd_ZSA],
(instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>;
// Arithmetic, shift by immediate
@@ -1688,16 +1793,17 @@ def : InstRW<[N3Write_2c_1V],
def : InstRW<[N3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>;
// Complex dot product 8-bit element
-def : InstRW<[N3Write_3c_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
// Complex dot product 16-bit element
-def : InstRW<[N3Write_4c_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
// Complex multiply-add B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
+def : InstRW<[N3Wr_ZCMABHS, N3Rd_ZCMABHS],
+ (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
// Complex multiply-add D element size
-def : InstRW<[N3Write_5c_2V0], (instrs CMLA_ZZZ_D)>;
+def : InstRW<[N3Wr_ZCMAD, N3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
// Conditional extract operations, scalar form
def : InstRW<[N3Write_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
@@ -1736,13 +1842,14 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D",
"^[SU]DIV_ZPZZ_D")>;
// Dot product, 8 bit
-def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
// Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB],
+ (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
// Dot product, 16 bit
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
+def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
// Duplicate, immediate and indexed form
def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$",
@@ -1804,7 +1911,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
def : InstRW<[N3Write_2c_1V], (instregex "^N?MATCH_PPzZZ_[BH]$")>;
// Matrix multiply-accumulate
-def : InstRW<[N3Write_3c_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+def : InstRW<[N3Wr_ZMMA, N3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
// Move prefix
def : InstRW<[N3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
@@ -1827,20 +1934,22 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
"^[SU]MULL[BT]_ZZZ_[HSD]$")>;
// Multiply accumulate, B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
- "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
+def : InstRW<[N3Wr_ZMABHS, ReadDefault, N3Rd_ZMABHS],
+ (instregex "^ML[AS]_ZZZI_[BHS]$",
+ "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
// Multiply accumulate, D element size
-def : InstRW<[N3Write_5c_2V0], (instregex "^ML[AS]_ZZZI_D$",
+def : InstRW<[N3Wr_ZMAD, ReadDefault, N3Rd_ZMAD], (instregex "^ML[AS]_ZZZI_D$",
"^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
// Multiply accumulate long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
+def : InstRW<[N3Wr_ZMAL, N3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
"^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;
// Multiply accumulate saturating doubling long regular
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
- "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
+def : InstRW<[N3Wr_ZMASQL, N3Rd_ZMASQ],
+ (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
+ "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
// Multiply saturating doubling high, B, H, S element size
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
@@ -1854,13 +1963,13 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
"^SQDMULL[BT]_ZZZI_[SD]$")>;
// Multiply saturating rounding doubling regular/complex accumulate, B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
+def : InstRW<[N3Wr_ZMASQBHS, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
"^SQRDCMLAH_ZZZ_[BHS]$",
"^SQRDML[AS]H_ZZZI_[HS]$",
"^SQRDCMLAH_ZZZI_[HS]$")>;
// Multiply saturating rounding doubling regular/complex accumulate, D element size
-def : InstRW<[N3Write_5c_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
+def : InstRW<[N3Wr_ZMASQD, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D$",
"^SQRDCMLAH_ZZZ_D$")>;
// Multiply saturating rounding doubling regular/complex, B, H, S element size
@@ -1949,8 +2058,9 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
def : InstRW<[N3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>;
// Floating point complex multiply add
-def : InstRW<[N3Write_4c_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$",
- "^FCMLA_ZZZI_[HS]$")>;
+def : InstRW<[N3Wr_ZFCMA, ReadDefault, N3Rd_ZFCMA],
+ (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Wr_ZFCMA, N3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
@@ -2014,12 +2124,15 @@ def : InstRW<[N3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
"^FMUL_ZPZ[IZ]_[HSD]")>;
// Floating point multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
- "^FN?ML[AS]_ZPZZZ_[HSD]",
- "^FML[AS]_ZZZI_[HSD]$")>;
+def : InstRW<[N3Wr_ZFMA, ReadDefault, N3Rd_ZFMA],
+ (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+ "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Wr_ZFMA, N3Rd_ZFMA],
+ (instregex "^FML[AS]_ZZZI_[HSD]",
+ "^FN?ML[AS]_ZPZZZ_[HSD]")>;
// Floating point multiply add/sub accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
+def : InstRW<[N3Wr_ZFMAL, N3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
// Floating point reciprocal estimate, F16
def : InstRW<[N3Write_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
@@ -2079,13 +2192,13 @@ def : InstRW<[N3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
// Dot product
-def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+def : InstRW<[N3Wr_ZBFDOT, N3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
// Matrix multiply accumulate
-def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>;
+def : InstRW<[N3Wr_ZBFMMA, N3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
// Multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
+def : InstRW<[N3Wr_ZBFMAL, N3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
// SVE Load instructions
// -----------------------------------------------------------------------------
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
index 59e3af8abd708..f6b9db13624b6 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
@@ -329,7 +329,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -343,9 +343,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0
# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
-# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0
+# CHECK-NEXT: [1,1] .D========eeER .. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -355,10 +355,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
-# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 2 6.0 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 6.6 0.1 0.0 <total>
# CHECK: [1] Code Region - smaddl
@@ -367,7 +367,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -381,9 +381,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0
# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
-# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,1] .D========eeER .. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] .D===========eeER smaddl x0, w0, w0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -393,39 +393,39 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
-# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 2 6.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 6.6 0.1 0.0 <total>
# CHECK: [2] Code Region - fmadd
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
-# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.29
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.35
+# CHECK-NEXT: IPC: 0.35
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 01234
-
-# CHECK: [0,0] DeeER. . . . . . . . . fadd d0, d0, d0
-# CHECK-NEXT: [0,1] D==eeeeER . . . . . . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [0,2] D======eeeER . . . . . . . fmul d0, d0, d0
-# CHECK-NEXT: [0,3] D=========eeeeER . . . . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [0,4] D=============eeeeER. . . . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [0,5] D=================eeeeER . . . . . fmadd d0, d0, d1, d2
-# CHECK-NEXT: [1,0] D=====================eeER . . . . fadd d0, d0, d0
-# CHECK-NEXT: [1,1] D=======================eeeeER. . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,2] D===========================eeeER . . . fmul d0, d0, d0
-# CHECK-NEXT: [1,3] D==============================eeeeER . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,4] .D=================================eeeeER . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,5] .D=====================================eeeeER fmadd d0, d0, d1, d2
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeER. . . . . . .. fadd d0, d0, d0
+# CHECK-NEXT: [0,1] D==eeeeER . . . . . .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,2] D======eeeER . . . . .. fmul d0, d0, d0
+# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,5] .D============eeeeER. . . .. fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] .D================eeER . . .. fadd d0, d0, d0
+# CHECK-NEXT: [1,1] .D==================eeeeER . .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] .D======================eeeER . .. fmul d0, d0, d0
+# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,4] . D========================eeeeER .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] . D============================eeeeER fmadd d0, d0, d1, d2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -434,38 +434,38 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fmul d0, d0, d0
-# CHECK-NEXT: 3. 2 20.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 5. 2 28.0 0.0 0.0 fmadd d0, d0, d1, d2
-# CHECK-NEXT: 2 19.2 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 4. 2 17.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 21.0 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 14.9 0.1 0.0 <total>
# CHECK: [3] Code Region - saba
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3] D============eeeeER . . . . saba v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D====================eeeeER . . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D========================eeeeER . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D============================eeeeER saba v0.4s, v0.4s, v1.4s
+# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D================eeeeER . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER saba v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -474,36 +474,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.6 0.1 0.0 <total>
# CHECK: [4] Code Region - sdot
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1303
+# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.31
-# CHECK-NEXT: IPC: 0.31
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 1.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeER. . . . . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,2] D=======eeeER . . . . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,3] D==========eeeER . . . sdot v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D=================eeeER . . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2] D====================eeeER . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3] D=======================eeeER sdot v0.4s, v0.16b, v1.16b
+# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeER. . . . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2] D=====eeeER . . . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D==============eeeER . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2] .D===============eeeER . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] .D==================eeeER sdot v0.4s, v0.16b, v1.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -512,36 +512,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2. 2 14.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 10.4 0.1 0.0 <total>
# CHECK: [5] Code Region - smmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1303
+# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.31
-# CHECK-NEXT: IPC: 0.31
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 1.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeER. . . . . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,2] D=======eeeER . . . . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,3] D==========eeeER . . . smmla v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D=================eeeER . . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2] D====================eeeER . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3] D=======================eeeER smmla v0.4s, v0.16b, v1.16b
+# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeER. . . . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2] D=====eeeER . . . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D==============eeeER . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2] .D===============eeeER . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] .D==================eeeER smmla v0.4s, v0.16b, v1.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -550,36 +550,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2. 2 14.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 10.4 0.1 0.0 <total>
# CHECK: [6] Code Region - mla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3] D============eeeeER . . . . mla v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D====================eeeeER . . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D========================eeeeER . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D============================eeeeER mla v0.4s, v0.4s, v1.4s
+# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D================eeeeER . . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER mla v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -588,36 +588,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.6 0.1 0.0 <total>
# CHECK: [7] Code Region - sqrdmlah
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
-# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] D============eeeeER . . . . sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D====================eeeeER . . sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D========================eeeeER . sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D============================eeeeER sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK: [0,0] DeeeeER . . . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . .. sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D======eeeeER . . .. sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D========eeeeER. . .. sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,0] D============eeeeER . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D===============eeeeER .. sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER.. sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D===================eeeeER sqrdmlah v0.8h, v1.8h, v2.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -626,36 +626,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2 11.1 0.1 0.0 <total>
# CHECK: [8] Code Region - smlal2
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] D============eeeeER . . . . smlal2 v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D====================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D========================eeeeER . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D============================eeeeER smlal2 v0.4s, v0.8h, v1.8h
+# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -664,36 +664,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 11.6 0.1 0.0 <total>
# CHECK: [9] Code Region - sqdmlal2
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
-# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] D============eeeeER . . . . sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D====================eeeeER . . sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D========================eeeeER . sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D============================eeeeER sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK: [0,0] DeeeeER . . . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . .. sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D======eeeeER . . .. sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D========eeeeER. . .. sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,0] D============eeeeER . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D===============eeeeER .. sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER.. sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D===================eeeeER sqdmlal2 v0.4s, v1.8h, v2.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -702,36 +702,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2 11.1 0.1 0.0 <total>
# CHECK: [10] Code Region - sadalp
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeeER . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [0,3] D============eeeeER . . . . sadalp v0.2d, v0.4s
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D====================eeeeER . . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [1,2] D========================eeeeER . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [1,3] D============================eeeeER sadalp v0.2d, v0.4s
+# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D====eeeeER . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D================eeeeER . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER sadalp v0.2d, v0.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -740,36 +740,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sadalp v0.2d, v1.4s
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sadalp v0.2d, v1.4s
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sadalp v0.2d, v0.4s
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 2 11.6 0.1 0.0 <total>
# CHECK: [11] Code Region - fcmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . fcmla v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] D==================eeeeER. . . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,2] D======================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,3] D==========================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK: [0,0] DeeeER . . . . . fmul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] .D===============eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -778,40 +778,40 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [12] Code Region - fmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
-# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.29
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.35
+# CHECK-NEXT: IPC: 0.35
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 01234
-
-# CHECK: [0,0] DeeeER . . . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,2] D=======eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3] D=========eeeeER . . . . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,4] D=============eeeeER. . . . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,5] D=================eeeeER . . . . . fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0] D=====================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D========================eeeeER . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2] D============================eeER . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] D==============================eeeeER . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4] .D=================================eeeeER . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5] .D=====================================eeeeER fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2] D=====eeER. . . . . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5] .D============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] .D================eeeER . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D=================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] .D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4] . D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] . D============================eeeeER fmla v0.2d, v0.2d, v1.2d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -820,42 +820,42 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 20.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5. 2 28.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: 2 19.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 14.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4. 2 17.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 21.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 14.6 0.1 0.0 <total>
# CHECK: [13] Code Region - fmlal
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
-# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.29
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.32
+# CHECK-NEXT: IPC: 0.32
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 01234
-
-# CHECK: [0,0] DeeeER . . . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,2] D=======eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3] D=========eeeeER . . . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,4] D=============eeeeER. . . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,5] D=================eeeeER . . . . . fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0] D=====================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D========================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2] D============================eeER . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] D==============================eeeeER . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,4] .D=================================eeeeER . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5] .D=====================================eeeeER fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: Index 0123456789 0123456789 0
+
+# CHECK: [0,0] DeeeER . . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2] D=======eeER . . . . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3] D=========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,4] D===========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5] .D==============eeeeER . . . . fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] .D==================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D=====================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] .D=========================eeER . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] .D===========================eeeeER. . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4] . D============================eeeeER . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] . D================================eeeeER fmlal v0.4s, v0.4h, v1.4h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -864,38 +864,38 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 20.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5. 2 28.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: 2 19.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4. 2 20.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 24.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 17.3 0.1 0.0 <total>
# CHECK: [14] Code Region - bfdot
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfdot v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D======================eeeeER . . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D==========================eeeeER bfdot v0.4s, v0.8h, v1.8h
+# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D=====================eeeeER bfdot v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -904,36 +904,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [15] Code Region - bfmmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total Cycles: 1603
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.22
-# CHECK-NEXT: IPC: 0.22
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: 0123456789 01234
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeER . . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] D========eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] D=============eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0] D==================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D=====================eeeeeER . . . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D===============================eeeeeER bfmmla v0.4s, v0.8h, v1.8h
+# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D==================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D==========================eeeeeER bfmmla v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -942,36 +942,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 13.6 0.1 0.0 <total>
# CHECK: [16] Code Region - bfmlalb
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfmlalb v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D======================eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D==========================eeeeER bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D=====================eeeeER bfmlalb v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -980,50 +980,50 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [17] Code Region - crc32
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1100
-# CHECK-NEXT: Total Cycles: 2203
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 1100
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.50
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.78
+# CHECK-NEXT: IPC: 0.78
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123456
-
-# CHECK: [0,0] DeeER. . . . . . . . .. mul w0, w0, w0
-# CHECK-NEXT: [0,1] D==eeER . . . . . . . .. crc32cb w0, w0, w1
-# CHECK-NEXT: [0,2] D====eeER . . . . . . . .. crc32cb w0, w0, w1
-# CHECK-NEXT: [0,3] D======eeER . . . . . . .. crc32cb w0, w0, w0
-# CHECK-NEXT: [0,4] D========eeER . . . . . . .. crc32b w0, w0, w15
-# CHECK-NEXT: [0,5] D==========eeER. . . . . . .. crc32h w0, w0, w21
-# CHECK-NEXT: [0,6] D============eeER . . . . . .. crc32w w0, w0, w24
-# CHECK-NEXT: [0,7] D==============eeER . . . . . .. crc32x w0, w0, x25
-# CHECK-NEXT: [0,8] D================eeER . . . . .. crc32ch w0, w0, w16
-# CHECK-NEXT: [0,9] D==================eeER . . . . .. crc32cw w0, w0, w23
-# CHECK-NEXT: [0,10] .D===================eeER. . . . .. crc32cx w0, w0, x5
-# CHECK-NEXT: [1,0] .D=====================eeER . . . .. mul w0, w0, w0
-# CHECK-NEXT: [1,1] .D=======================eeER . . . .. crc32cb w0, w0, w1
-# CHECK-NEXT: [1,2] .D=========================eeER . . .. crc32cb w0, w0, w1
-# CHECK-NEXT: [1,3] .D===========================eeER . . .. crc32cb w0, w0, w0
-# CHECK-NEXT: [1,4] .D=============================eeER. . .. crc32b w0, w0, w15
-# CHECK-NEXT: [1,5] .D===============================eeER . .. crc32h w0, w0, w21
-# CHECK-NEXT: [1,6] .D=================================eeER . .. crc32w w0, w0, w24
-# CHECK-NEXT: [1,7] .D===================================eeER .. crc32x w0, w0, x25
-# CHECK-NEXT: [1,8] .D=====================================eeER .. crc32ch w0, w0, w16
-# CHECK-NEXT: [1,9] . D======================================eeER.. crc32cw w0, w0, w23
-# CHECK-NEXT: [1,10] . D========================================eeER crc32cx w0, w0, x5
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeER. . . . . . mul w0, w0, w0
+# CHECK-NEXT: [0,1] D==eeER . . . . . crc32cb w0, w0, w1
+# CHECK-NEXT: [0,2] D===eeER . . . . . crc32cb w0, w0, w1
+# CHECK-NEXT: [0,3] D=====eeER. . . . . crc32cb w0, w0, w0
+# CHECK-NEXT: [0,4] D======eeER . . . . crc32b w0, w0, w15
+# CHECK-NEXT: [0,5] .D======eeER . . . . crc32h w0, w0, w21
+# CHECK-NEXT: [0,6] .D=======eeER . . . . crc32w w0, w0, w24
+# CHECK-NEXT: [0,7] .D========eeER . . . . crc32x w0, w0, x25
+# CHECK-NEXT: [0,8] .D=========eeER. . . . crc32ch w0, w0, w16
+# CHECK-NEXT: [0,9] .D==========eeER . . . crc32cw w0, w0, w23
+# CHECK-NEXT: [0,10] . D==========eeER . . . crc32cx w0, w0, x5
+# CHECK-NEXT: [1,0] . D============eeER . . . mul w0, w0, w0
+# CHECK-NEXT: [1,1] . D==============eeER . . crc32cb w0, w0, w1
+# CHECK-NEXT: [1,2] . D===============eeER . . crc32cb w0, w0, w1
+# CHECK-NEXT: [1,3] . D=================eeER . . crc32cb w0, w0, w0
+# CHECK-NEXT: [1,4] . D=================eeER. . crc32b w0, w0, w15
+# CHECK-NEXT: [1,5] . D==================eeER . crc32h w0, w0, w21
+# CHECK-NEXT: [1,6] . D===================eeER . crc32w w0, w0, w24
+# CHECK-NEXT: [1,7] . D====================eeER . crc32x w0, w0, x25
+# CHECK-NEXT: [1,8] . D=====================eeER . crc32ch w0, w0, w16
+# CHECK-NEXT: [1,9] . D=====================eeER. crc32cw w0, w0, w23
+# CHECK-NEXT: [1,10] . D======================eeER crc32cx w0, w0, x5
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1032,43 +1032,43 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.5 0.5 0.0 mul w0, w0, w0
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 crc32cb w0, w0, w1
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 crc32cb w0, w0, w1
-# CHECK-NEXT: 3. 2 17.5 0.0 0.0 crc32cb w0, w0, w0
-# CHECK-NEXT: 4. 2 19.5 0.0 0.0 crc32b w0, w0, w15
-# CHECK-NEXT: 5. 2 21.5 0.0 0.0 crc32h w0, w0, w21
-# CHECK-NEXT: 6. 2 23.5 0.0 0.0 crc32w w0, w0, w24
-# CHECK-NEXT: 7. 2 25.5 0.0 0.0 crc32x w0, w0, x25
-# CHECK-NEXT: 8. 2 27.5 0.0 0.0 crc32ch w0, w0, w16
-# CHECK-NEXT: 9. 2 29.0 0.0 0.0 crc32cw w0, w0, w23
-# CHECK-NEXT: 10. 2 30.5 0.0 0.0 crc32cx w0, w0, x5
-# CHECK-NEXT: 2 21.4 0.0 0.0 <total>
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul w0, w0, w0
+# CHECK-NEXT: 1. 2 9.0 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 2. 2 10.0 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 3. 2 12.0 0.0 0.0 crc32cb w0, w0, w0
+# CHECK-NEXT: 4. 2 12.5 0.0 0.0 crc32b w0, w0, w15
+# CHECK-NEXT: 5. 2 13.0 0.0 0.0 crc32h w0, w0, w21
+# CHECK-NEXT: 6. 2 14.0 0.0 0.0 crc32w w0, w0, w24
+# CHECK-NEXT: 7. 2 15.0 0.0 0.0 crc32x w0, w0, x25
+# CHECK-NEXT: 8. 2 16.0 0.0 0.0 crc32ch w0, w0, w16
+# CHECK-NEXT: 9. 2 16.5 0.0 0.0 crc32cw w0, w0, w23
+# CHECK-NEXT: 10. 2 17.0 0.0 0.0 crc32cx w0, w0, x5
+# CHECK-NEXT: 2 12.9 0.0 0.0 <total>
# CHECK: [18] Code Region - Z saba
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. saba z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==========================eeeeER .. saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3] D==============================eeeeER saba z0.d, z0.d, z1.d
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2] D======eeeeER . . . . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3] D==========eeeeER . . . saba z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D===================eeeeER . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D=======================eeeeER saba z0.d, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1077,36 +1077,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 saba z0.d, z1.d, z2.d
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 saba z0.d, z1.d, z2.d
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 saba z0.d, z0.d, z1.d
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 saba z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [19] Code Region - Z sabalt
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1603
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.25
-# CHECK-NEXT: IPC: 0.25
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 01234
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeeER . . . . . . mul z0.h, z0.h, z0.h
-# CHECK-NEXT: [0,1] D====eeeeER . . . . . sabalt z0.h, z1.b, z2.b
-# CHECK-NEXT: [0,2] D========eeeeER. . . . . sabalt z0.h, z1.b, z2.b
-# CHECK-NEXT: [0,3] D============eeeeER . . . . sabalt z0.h, z0.b, z1.b
-# CHECK-NEXT: [1,0] D================eeeeER . . . mul z0.h, z0.h, z0.h
-# CHECK-NEXT: [1,1] D====================eeeeER . . sabalt z0.h, z1.b, z2.b
-# CHECK-NEXT: [1,2] D========================eeeeER . sabalt z0.h, z1.b, z2.b
-# CHECK-NEXT: [1,3] D============================eeeeER sabalt z0.h, z0.b, z1.b
+# CHECK: [0,0] DeeeeER . . . . . mul z0.h, z0.h, z0.h
+# CHECK-NEXT: [0,1] D====eeeeER . . . . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,3] D=========eeeeER . . . sabalt z0.h, z0.b, z1.b
+# CHECK-NEXT: [1,0] D=============eeeeER. . . mul z0.h, z0.h, z0.h
+# CHECK-NEXT: [1,1] .D================eeeeER . . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=================eeeeER. . sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=====================eeeeER sabalt z0.h, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1115,36 +1115,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul z0.h, z0.h, z0.h
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sabalt z0.h, z1.b, z2.b
-# CHECK-NEXT: 2. 2 17.0 0.0 0.0 sabalt z0.h, z1.b, z2.b
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sabalt z0.h, z0.b, z1.b
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.h, z0.h, z0.h
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sabalt z0.h, z1.b, z2.b
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sabalt z0.h, z0.b, z1.b
+# CHECK-NEXT: 2 11.6 0.1 0.0 <total>
# CHECK: [20] Code Region - Z sadalp
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sadalp z0.d, p0/m, z0.s
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,2] D==========================eeeeER .. sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,3] D==============================eeeeER sadalp z0.d, p0/m, z0.s
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,2] D======eeeeER . . . . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,3] D==========eeeeER . . . sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2] .D===================eeeeER . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3] .D=======================eeeeER sadalp z0.d, p0/m, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1153,36 +1153,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sadalp z0.d, p0/m, z0.s
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [21] Code Region - Z ssra
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. ssra z0.d, z1.d, #1
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. ssra z0.d, z1.d, #1
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. ssra z0.d, z0.d, #1
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. ssra z0.d, z1.d, #1
-# CHECK-NEXT: [1,2] D==========================eeeeER .. ssra z0.d, z1.d, #1
-# CHECK-NEXT: [1,3] D==============================eeeeER ssra z0.d, z0.d, #1
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [0,2] D======eeeeER . . . . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [0,3] D==========eeeeER . . . ssra z0.d, z0.d, #1
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,2] .D===================eeeeER . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,3] .D=======================eeeeER ssra z0.d, z0.d, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1191,36 +1191,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 ssra z0.d, z1.d, #1
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 ssra z0.d, z1.d, #1
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 ssra z0.d, z0.d, #1
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 ssra z0.d, z0.d, #1
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [22] Code Region - Z cdot.s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.29
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.42
+# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
-# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeER . . . . cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [0,2] D========eeeER . . . . cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [0,3] D===========eeeER . . . cdot z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeER. . cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2] D======================eeeER . cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3] D=========================eeeER cdot z0.s, z0.b, z1.b, #90
+# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeER . . .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2] D======eeeER . . .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3] D=========eeeER. . .. cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] .D=================eeeER .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] .D====================eeeER cdot z0.s, z0.b, z1.b, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1229,36 +1229,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 2. 2 16.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 3. 2 19.0 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [23] Code Region - Z cdot.d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. cdot z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,2] D==========================eeeeER .. cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,3] D==============================eeeeER cdot z0.d, z0.h, z1.h, #90
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,2] D======eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,3] D==========eeeeER . . . cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2] .D===================eeeeER . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3] .D=======================eeeeER cdot z0.d, z0.h, z1.h, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1267,36 +1267,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [24] Code Region - Z cmla.b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. cmla z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2] D==========================eeeeER .. cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3] D==============================eeeeER cmla z0.b, z0.b, z1.b, #90
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2] D======eeeeER . . . . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3] D==========eeeeER . . . cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] .D===================eeeeER . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] .D=======================eeeeER cmla z0.b, z0.b, z1.b, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1305,36 +1305,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [25] Code Region - Z cmla.d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 2003
-# CHECK-NEXT: Total uOps: 800
+# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.40
-# CHECK-NEXT: IPC: 0.20
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.28
+# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012
+# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [0,2] D==========eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [0,3] D===============eeeeeER . . . . . cmla z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0] D====================eeeeeER . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] .D========================eeeeeER . . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2] .D=============================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3] .D==================================eeeeeER cmla z0.d, z0.d, z1.d, #90
+# CHECK: [0,0] DeeeeeER . . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2] D========eeeeeER . . . . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3] D=============eeeeeER . . . . cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D==============================eeeeeER cmla z0.d, z0.d, z1.d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1343,36 +1343,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 2. 2 20.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 3. 2 25.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: 2 18.1 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [26] Code Region - Z sdot.s
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.29
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.42
+# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
-# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeER . . . . sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,2] D========eeeER . . . . sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,3] D===========eeeER . . . sdot z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeER. . sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2] D======================eeeER . sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3] D=========================eeeER sdot z0.s, z0.b, z1.b
+# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1381,36 +1381,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: 2. 2 16.0 0.0 0.0 sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sdot z0.s, z0.b, z1.b
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [27] Code Region - Z sudot
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.29
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.42
+# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456
-# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeER . . . . sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [0,2] D========eeeER . . . . sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [0,3] D===========eeeER . . . sdot z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeER. . sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,2] D======================eeeER . sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,3] D=========================eeeER sdot z0.s, z0.b, z1.b[1]
+# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b[1]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1419,36 +1419,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 2. 2 16.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [28] Code Region - Z sdot.d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sdot z0.d, z0.h, z1.h
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,2] D==========================eeeeER .. sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,3] D==============================eeeeER sdot z0.d, z0.h, z1.h
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D===================eeeeER . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=======================eeeeER sdot z0.d, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1457,36 +1457,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sdot z0.d, z0.h, z1.h
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [29] Code Region - Z smmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1303
+# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.31
-# CHECK-NEXT: IPC: 0.31
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 1.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DeeeeER . . . . . mul z0.s, z0.s, z0.s
-# CHECK-NEXT: [0,1] D====eeeER. . . . . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,2] D=======eeeER . . . . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,3] D==========eeeER . . . smmla z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0] D=============eeeeER. . . mul z0.s, z0.s, z0.s
-# CHECK-NEXT: [1,1] D=================eeeER . . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2] D====================eeeER . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3] D=======================eeeER smmla z0.s, z0.b, z1.b
+# CHECK: [0,0] DeeeeER . . . . mul z0.s, z0.s, z0.s
+# CHECK-NEXT: [0,1] D====eeeER. . . . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2] D=====eeeER . . . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3] D========eeeER . . . smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0] D===========eeeeER . . mul z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,1] .D==============eeeER . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D===============eeeER . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D==================eeeER smmla z0.s, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1495,36 +1495,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.s, z0.s, z0.s
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: 2. 2 14.5 0.0 0.0 smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smmla z0.s, z0.b, z1.b
-# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.s, z0.s, z0.s
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 10.4 0.1 0.0 <total>
# CHECK: [30] Code Region - Z mla.b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. mla z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,2] D==========================eeeeER .. mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,3] D==============================eeeeER mla z0.b, p0/m, z0.b, z1.b
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,2] D======eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,3] D==========eeeeER . . . mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=======================eeeeER mla z0.b, p0/m, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1533,36 +1533,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [31] Code Region - Z mla.d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 2003
-# CHECK-NEXT: Total uOps: 800
+# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.40
-# CHECK-NEXT: IPC: 0.20
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.28
+# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012
+# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2] D==========eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3] D===============eeeeeER . . . . . mla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0] D====================eeeeeER . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] .D========================eeeeeER . . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2] .D=============================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3] .D==================================eeeeeER mla z0.d, p0/m, z0.d, z1.d
+# CHECK: [0,0] DeeeeeER . . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2] D========eeeeeER . . . . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3] D=============eeeeeER . . . . mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==============================eeeeeER mla z0.d, p0/m, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1571,36 +1571,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2. 2 20.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3. 2 25.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: 2 18.1 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [32] Code Region - Z smlalb
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. smlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2] D==========================eeeeER .. smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3] D==============================eeeeER smlalb z0.d, z0.s, z1.s
+# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2] D======eeeeER . . . . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3] D==========eeeeER . . . smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] .D===================eeeeER . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] .D=======================eeeeER smlalb z0.d, z0.s, z1.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1609,36 +1609,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 smlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [33] Code Region - Z sqdmlalb
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 012
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sqdmlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2] D==========================eeeeER .. sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3] D==============================eeeeER sqdmlalb z0.d, z0.s, z1.s
+# CHECK: [0,0] DeeeeeER . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] .D=========================eeeeER sqdmlalb z0.d, z0.s, z1.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1647,36 +1647,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [34] Code Region - Z sqrdmlah.b
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.23
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: 0123456789 012
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [0,2] D=========eeeeER . . . .. sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [0,3] D=============eeeeER. . . .. sqrdmlah z0.b, z0.b, z1.b
-# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D======================eeeeER . .. sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,2] D==========================eeeeER .. sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,3] D==============================eeeeER sqrdmlah z0.b, z0.b, z1.b
+# CHECK: [0,0] DeeeeeER . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=========================eeeeER sqrdmlah z0.b, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1685,36 +1685,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: 2. 2 18.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [35] Code Region - Z sqrdmlah.d
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 2003
-# CHECK-NEXT: Total uOps: 800
+# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.40
-# CHECK-NEXT: IPC: 0.20
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.28
+# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012
+# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,2] D==========eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,3] D===============eeeeeER . . . . . sqrdmlah z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0] D====================eeeeeER . . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] .D========================eeeeeER . . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2] .D=============================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3] .D==================================eeeeeER sqrdmlah z0.d, z0.d, z1.d
+# CHECK: [0,0] DeeeeeER . . . . . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2] D========eeeeeER . . . . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3] D=============eeeeeER . . . . sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==============================eeeeeER sqrdmlah z0.d, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1723,36 +1723,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: 2. 2 20.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: 3. 2 25.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
-# CHECK-NEXT: 2 18.1 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [36] Code Region - Z fcmla ZPmZZ
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2] D======================eeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3] D==========================eeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1761,36 +1761,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [37] Code Region - Z fcmla ZZZI
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,2] D======================eeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,3] D==========================eeeeER fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla z0.s, z0.s, z1.s[1], #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1799,36 +1799,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [38] Code Region - Z fmla ZPmZZ
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . fmla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2] D======================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3] D==========================eeeeER fmla z0.d, p0/m, z0.d, z1.d
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, p0/m, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1837,36 +1837,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [39] Code Region - Z fmla ZZZI
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . fmla z0.d, z0.d, z1.d[1]
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,2] D======================eeeeER . . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,3] D==========================eeeeER fmla z0.d, z0.d, z1.d[1]
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, z0.d, z1.d[1]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1875,36 +1875,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [40] Code Region - Z fmlalb ZZZ
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . fmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D======================eeeeER . . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D==========================eeeeER fmlalb z0.s, z0.h, z1.h
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D=========eeeeER . . . fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmlalb z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1913,36 +1913,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [41] Code Region - Z bfdot
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfdot z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D======================eeeeER . . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D==========================eeeeER bfdot z0.s, z0.h, z1.h
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=====================eeeeER bfdot z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1951,36 +1951,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
# CHECK: [42] Code Region - Z bfmmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total Cycles: 1603
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.22
-# CHECK-NEXT: IPC: 0.22
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.25
+# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: 0123456789 01234
# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeER . . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2] D========eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3] D=============eeeeeER . . . . bfmmla z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0] D==================eeeER . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=====================eeeeeER . . . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D===============================eeeeeER bfmmla z0.s, z0.h, z1.h
+# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeeER . . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D==========================eeeeeER bfmmla z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1989,36 +1989,36 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 13.6 0.1 0.0 <total>
# CHECK: [43] Code Region - bfmlalb
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1503
+# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.27
-# CHECK-NEXT: IPC: 0.27
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 0.31
+# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
-# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2] D=======eeeeER . . . . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3] D===========eeeeER . . . . bfmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D==================eeeeER. . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D======================eeeeER . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D==========================eeeeER bfmlalb z0.s, z0.h, z1.h
+# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===============eeeeER . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=====================eeeeER bfmlalb z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2027,8 +2027,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 11.4 0.1 0.0 <total>
More information about the llvm-commits
mailing list