[llvm] [AArch64] Add SchedReadAdvance to the Neoverse-N3 scheduling model (PR #167302)

Asher Dobrescu via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 10 03:34:49 PST 2025


https://github.com/Asher8118 created https://github.com/llvm/llvm-project/pull/167302

Add SchedReadAdvance entries to the Neoverse-N3 scheduling model to describe late forwarding.

From 461c54cb6bd49d84a27feee73eae394e24b9b6b7 Mon Sep 17 00:00:00 2001
From: Ash Dobrescu <ash.dobrescu at arm.com>
Date: Fri, 7 Nov 2025 15:13:51 +0000
Subject: [PATCH 1/2] [AArch64] Add SchedReadAdvance to Neoverse-N3 scheduling
 model

Introduce a description of late forwarding to the Neoverse-N3 scheduling model.
---
 .../llvm-mca/AArch64/Neoverse/N3-forwarding.s | 2034 +++++++++++++++++
 1 file changed, 2034 insertions(+)
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s

diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
new file mode 100644
index 0000000000000..59e3af8abd708
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
@@ -0,0 +1,2034 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n3 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul  x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul    x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul  v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sdot
+mul  v0.4s, v0.4s,  v0.4s
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smmla
+mul   v0.4s, v0.4s,  v0.4s
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqrdmlah
+mul    v0.4s, v0.4s, v0.4s
+sqrdmlah v0.8h, v1.8h, v2.8h
+sqrdmlah v0.8h, v1.8h, v2.8h
+sqrdmlah v0.8h, v1.8h, v2.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul    v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqdmlal2
+mul    v0.4s, v0.4s, v0.4s
+sqdmlal2 v0.4s, v1.8h, v2.8h
+sqdmlal2 v0.4s, v1.8h, v2.8h
+sqdmlal2 v0.4s, v1.8h, v2.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul    v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fcmla
+fmul  v0.4s, v0.4s, v0.4s
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v0.2d, v1.2d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmlal
+fmul  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fadd  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfdot
+fmul  v0.2d, v0.2d, v0.2d
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmmla
+fmul   v0.2d, v0.2d, v0.2d
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul    v0.2d, v0.2d, v0.2d
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32
+mul     w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+crc32b  w0, w0, w15
+crc32h  w0, w0, w21
+crc32w  w0, w0, w24
+crc32x  w0, w0, x25
+crc32ch w0, w0, w16
+crc32cw w0, w0, w23
+crc32cx w0, w0, x5
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z saba
+mul  z0.d, z0.d, z0.d
+saba z0.d, z1.d, z2.d
+saba z0.d, z1.d, z2.d
+saba z0.d, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sabalt
+mul  z0.h, z0.h, z0.h
+sabalt z0.h, z1.b, z2.b
+sabalt z0.h, z1.b, z2.b
+sabalt z0.h, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sadalp
+mul    z0.d, z0.d, z0.d
+sadalp z0.d, p0/m, z1.s
+sadalp z0.d, p0/m, z1.s
+sadalp z0.d, p0/m, z0.s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z ssra
+mul  z0.d, z0.d, z0.d
+ssra z0.d, z1.d, #1
+ssra z0.d, z1.d, #1
+ssra z0.d, z0.d, #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cdot.s
+mul  z0.d, z0.d, z0.d
+cdot z0.s, z1.b, z2.b, #90
+cdot z0.s, z1.b, z2.b, #90
+cdot z0.s, z0.b, z1.b, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cdot.d
+mul  z0.d, z0.d, z0.d
+cdot z0.d, z1.h, z2.h, #90
+cdot z0.d, z1.h, z2.h, #90
+cdot z0.d, z0.h, z1.h, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cmla.b
+mul  z0.d, z0.d, z0.d
+cmla z0.b, z1.b, z2.b, #90
+cmla z0.b, z1.b, z2.b, #90
+cmla z0.b, z0.b, z1.b, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z cmla.d
+mul  z0.d, z0.d, z0.d
+cmla z0.d, z1.d, z2.d, #90
+cmla z0.d, z1.d, z2.d, #90
+cmla z0.d, z0.d, z1.d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.s
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sudot
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z0.b, z1.b[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.d
+mul  z0.d, z0.d, z0.d
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smmla
+mul   z0.s, z0.s, z0.s
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.b
+mul z0.d, z0.d, z0.d
+mla z0.b, p0/m, z1.b, z2.b
+mla z0.b, p0/m, z1.b, z2.b
+mla z0.b, p0/m, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.d
+mul z0.d, z0.d, z0.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smlalb
+mul    z0.d, z0.d, z0.d
+smlalb z0.d, z1.s, z2.s
+smlalb z0.d, z1.s, z2.s
+smlalb z0.d, z0.s, z1.s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sqdmlalb
+mul      z0.d, z0.d, z0.d
+sqdmlalb z0.d, z1.s, z2.s
+sqdmlalb z0.d, z1.s, z2.s
+sqdmlalb z0.d, z0.s, z1.s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sqrdmlah.b
+mul      z0.d, z0.d, z0.d
+sqrdmlah z0.b, z1.b, z2.b
+sqrdmlah z0.b, z1.b, z2.b
+sqrdmlah z0.b, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sqrdmlah.d
+mul      z0.d, z0.d, z0.d
+sqrdmlah z0.d, z1.d, z2.d
+sqrdmlah z0.d, z1.d, z2.d
+sqrdmlah z0.d, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZPmZZ
+fmul  z0.d, z0.d, z0.d
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z0.d, z1.d, 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZZZI
+fmul  z0.d, z0.d, z0.d
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z0.s, z1.s[1], 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fmla z0.d, p0/m, z1.d, z2.d
+fmla z0.d, p0/m, z1.d, z2.d
+fmla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZZZI
+fmul z0.d, z0.d, z0.d
+fmla z0.d, z1.d, z2.d[1]
+fmla z0.d, z1.d, z2.d[1]
+fmla z0.d, z0.d, z1.d[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmlalb ZZZ
+fmul   z0.d, z0.d, z0.d
+fmlalb z0.s, z1.h, z2.h
+fmlalb z0.s, z1.h, z2.h
+fmlalb z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z bfdot
+fmul  z0.d, z0.d, z0.d
+bfdot z0.s, z1.h, z2.h
+bfdot z0.s, z1.h, z2.h
+bfdot z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z bfmmla
+fmul   z0.d, z0.d, z0.d
+bfmmla z0.s, z1.h, z2.h
+bfmmla z0.s, z1.h, z2.h
+bfmmla z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul    z0.d, z0.d, z0.d
+bfmlalb z0.s, z1.h, z2.h
+bfmlalb z0.s, z1.h, z2.h
+bfmlalb z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - madd
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      703
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeER   .    ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D===eeER  .    ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+
+# CHECK:      [1] Code Region - smaddl
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      703
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeER   .    ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,2]     D===eeER  .    ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,3]     D=====eeER.    ..   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     D=========eeER ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,2]     D==========eeER..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     D============eeER   smaddl	x0, w0, w0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     6.5    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 2.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - fmadd
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   .   fadd	d0, d0, d0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    .    .   .   fmul	d0, d0, d0
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     D=====================eeER    .    .    .   .   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     D=======================eeeeER.    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     D===========================eeeER  .    .   .   fmul	d0, d0, d0
+# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,4]     .D=================================eeeeER   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     .D=====================================eeeeER   fmadd	d0, d0, d1, d2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     28.0   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     19.2   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - saba
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     D============================eeeeER   saba	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - sdot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeER  .  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     D====================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     D=======================eeeER   sdot	v0.4s, v0.16b, v1.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - smmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeER  .  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     D====================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     D=======================eeeER   smmla	v0.4s, v0.16b, v1.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - mla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     D============================eeeeER   mla	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - sqrdmlah
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D============================eeeeER   sqrdmlah	v0.8h, v1.8h, v2.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - smlal2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D============================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - sqdmlal2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D============================eeeeER   sqdmlal2	v0.4s, v1.8h, v2.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - sadalp
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sadalp	v0.2d, v0.4s
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,3]     D============================eeeeER   sadalp	v0.2d, v0.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sadalp	v0.2d, v0.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [11] Code Region - fcmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [12] Code Region - fmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     D=====================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D========================eeeeER    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     D============================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4]     .D=================================eeeeER   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     .D=====================================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     28.0   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     19.5   0.1    0.0       <total>
+
+# CHECK:      [13] Code Region - fmlal
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     D=====================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D========================eeeeER    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     D============================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4]     .D=================================eeeeER   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     .D=====================================eeeeER   fmlal	v0.4s, v0.4h, v1.4h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     28.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     19.5   0.1    0.0       <total>
+
+# CHECK:      [14] Code Region - bfdot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [15] Code Region - bfmmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+
+# CHECK:      [16] Code Region - bfmlalb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [17] Code Region - crc32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      1100
+# CHECK-NEXT: Total Cycles:      2203
+# CHECK-NEXT: Total uOps:        1100
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .    ..   mul	w0, w0, w0
+# CHECK-NEXT: [0,1]     D==eeER   .    .    .    .    .    .    .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,2]     D====eeER .    .    .    .    .    .    .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,3]     D======eeER    .    .    .    .    .    .    ..   crc32cb	w0, w0, w0
+# CHECK-NEXT: [0,4]     D========eeER  .    .    .    .    .    .    ..   crc32b	w0, w0, w15
+# CHECK-NEXT: [0,5]     D==========eeER.    .    .    .    .    .    ..   crc32h	w0, w0, w21
+# CHECK-NEXT: [0,6]     D============eeER   .    .    .    .    .    ..   crc32w	w0, w0, w24
+# CHECK-NEXT: [0,7]     D==============eeER .    .    .    .    .    ..   crc32x	w0, w0, x25
+# CHECK-NEXT: [0,8]     D================eeER    .    .    .    .    ..   crc32ch	w0, w0, w16
+# CHECK-NEXT: [0,9]     D==================eeER  .    .    .    .    ..   crc32cw	w0, w0, w23
+# CHECK-NEXT: [0,10]    .D===================eeER.    .    .    .    ..   crc32cx	w0, w0, x5
+# CHECK-NEXT: [1,0]     .D=====================eeER   .    .    .    ..   mul	w0, w0, w0
+# CHECK-NEXT: [1,1]     .D=======================eeER .    .    .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,2]     .D=========================eeER    .    .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,3]     .D===========================eeER  .    .    ..   crc32cb	w0, w0, w0
+# CHECK-NEXT: [1,4]     .D=============================eeER.    .    ..   crc32b	w0, w0, w15
+# CHECK-NEXT: [1,5]     .D===============================eeER   .    ..   crc32h	w0, w0, w21
+# CHECK-NEXT: [1,6]     .D=================================eeER .    ..   crc32w	w0, w0, w24
+# CHECK-NEXT: [1,7]     .D===================================eeER    ..   crc32x	w0, w0, x25
+# CHECK-NEXT: [1,8]     .D=====================================eeER  ..   crc32ch	w0, w0, w16
+# CHECK-NEXT: [1,9]     . D======================================eeER..   crc32cw	w0, w0, w23
+# CHECK-NEXT: [1,10]    . D========================================eeER   crc32cx	w0, w0, x5
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       mul	w0, w0, w0
+# CHECK-NEXT: 1.     2     13.5   0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       crc32cb	w0, w0, w0
+# CHECK-NEXT: 4.     2     19.5   0.0    0.0       crc32b	w0, w0, w15
+# CHECK-NEXT: 5.     2     21.5   0.0    0.0       crc32h	w0, w0, w21
+# CHECK-NEXT: 6.     2     23.5   0.0    0.0       crc32w	w0, w0, w24
+# CHECK-NEXT: 7.     2     25.5   0.0    0.0       crc32x	w0, w0, x25
+# CHECK-NEXT: 8.     2     27.5   0.0    0.0       crc32ch	w0, w0, w16
+# CHECK-NEXT: 9.     2     29.0   0.0    0.0       crc32cw	w0, w0, w23
+# CHECK-NEXT: 10.    2     30.5   0.0    0.0       crc32cx	w0, w0, x5
+# CHECK-NEXT:        2     21.4   0.0    0.0       <total>
+
+# CHECK:      [18] Code Region - Z saba
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   saba	z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3]     D==============================eeeeER   saba	z0.d, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       saba	z0.d, z0.d, z1.d
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [19] Code Region - Z sabalt
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	z0.h, z0.h, z0.h
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sabalt	z0.h, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	z0.h, z0.h, z0.h
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D============================eeeeER   sabalt	z0.h, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	z0.h, z0.h, z0.h
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sabalt	z0.h, z0.b, z1.b
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [20] Code Region - Z sadalp
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sadalp	z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3]     D==============================eeeeER   sadalp	z0.d, p0/m, z0.s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sadalp	z0.d, p0/m, z0.s
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [21] Code Region - Z ssra
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   ssra	z0.d, z0.d, #1
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [1,3]     D==============================eeeeER   ssra	z0.d, z0.d, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       ssra	z0.d, z1.d, #1
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       ssra	z0.d, z1.d, #1
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       ssra	z0.d, z0.d, #1
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [22] Code Region - Z cdot.s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   cdot	z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeER.    .   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2]     D======================eeeER  .   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3]     D=========================eeeER   cdot	z0.s, z0.b, z1.b, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       cdot	z0.s, z0.b, z1.b, #90
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [23] Code Region - Z cdot.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   cdot	z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3]     D==============================eeeeER   cdot	z0.d, z0.h, z1.h, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       cdot	z0.d, z0.h, z1.h, #90
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [24] Code Region - Z cmla.b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   cmla	z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3]     D==============================eeeeER   cmla	z0.b, z0.b, z1.b, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       cmla	z0.b, z0.b, z1.b, #90
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [25] Code Region - Z cmla.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   cmla	z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D========================eeeeeER  .    . .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     .D=============================eeeeeER  . .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     .D==================================eeeeeER   cmla	z0.d, z0.d, z1.d, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     15.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     20.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     25.5   0.0    0.0       cmla	z0.d, z0.d, z1.d, #90
+# CHECK-NEXT:        2     18.1   0.1    0.0       <total>
+
+# CHECK:      [26] Code Region - Z sdot.s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [27] Code Region - Z sudot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b[1]
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [28] Code Region - Z sdot.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D==============================eeeeER   sdot	z0.d, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [29] Code Region - Z smmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	z0.s, z0.s, z0.s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,1]     D=================eeeER  .  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D====================eeeER  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D=======================eeeER   smmla	z0.s, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.s, z0.s, z0.s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+
+# CHECK:      [30] Code Region - Z mla.b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   mla	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D==============================eeeeER   mla	z0.b, p0/m, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       mla	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [31] Code Region - Z mla.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D========================eeeeeER  .    . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=============================eeeeeER  . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==================================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     15.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     20.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     25.5   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     18.1   0.1    0.0       <total>
+
+# CHECK:      [32] Code Region - Z smlalb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   smlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3]     D==============================eeeeER   smlalb	z0.d, z0.s, z1.s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       smlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [33] Code Region - Z sqdmlalb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sqdmlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3]     D==============================eeeeER   sqdmlalb	z0.d, z0.s, z1.s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sqdmlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [34] Code Region - Z sqrdmlah.b
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sqrdmlah	z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D==============================eeeeER   sqrdmlah	z0.b, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sqrdmlah	z0.b, z0.b, z1.b
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [35] Code Region - Z sqrdmlah.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   sqrdmlah	z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D========================eeeeeER  .    . .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=============================eeeeeER  . .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==================================eeeeeER   sqrdmlah	z0.d, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     15.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: 2.     2     20.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: 3.     2     25.5   0.0    0.0       sqrdmlah	z0.d, z0.d, z1.d
+# CHECK-NEXT:        2     18.1   0.1    0.0       <total>
+
+# CHECK:      [36] Code Region - Z fcmla ZPmZZ
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [37] Code Region - Z fcmla ZZZI
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [38] Code Region - Z fmla ZPmZZ
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [39] Code Region - Z fmla ZZZI
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, z0.d, z1.d[1]
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [40] Code Region - Z fmlalb ZZZ
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fmlalb	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [41] Code Region - Z bfdot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [42] Code Region - Z bfmmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+
+# CHECK:      [43] Code Region - bfmlalb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfmlalb	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>

>From cfa2112ff0dca73bfa6a37b5b6234f38674399b9 Mon Sep 17 00:00:00 2001
From: Ash Dobrescu <ash.dobrescu at arm.com>
Date: Fri, 7 Nov 2025 16:28:37 +0000
Subject: [PATCH 2/2] [AArch64] Add late forwarding to Neoverse-N3 and update
 test

---
 .../Target/AArch64/AArch64SchedNeoverseN3.td  |  205 +-
 .../llvm-mca/AArch64/Neoverse/N3-forwarding.s | 1688 ++++++++---------
 2 files changed, 1003 insertions(+), 890 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index 411b372a3f533..d3705b932bf62 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -553,6 +553,107 @@ def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0, N3UnitV0, N3UnitV0, N3UnitV0,
     let NumMicroOps = 16;
 }
 
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+// NOTE: SOG, p. 19, n. 2 suggests that accumulator forwarding is not supported
+// for consumers of 64-bit multiply high operations (TODO: confirm).
+
+def N3Wr_FMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FMA : SchedReadAdvance<2, [WriteFMul, N3Wr_FMA]>;
+
+def N3Wr_VMA : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMA : SchedReadAdvance<3, [N3Wr_VMA]>;
+
+def N3Wr_VMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMAL : SchedReadAdvance<3, [N3Wr_VMAL]>;
+
+def N3Wr_VMAH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMAH : SchedReadAdvance<2, [N3Wr_VMAH]>;
+
+def N3Wr_VMASL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMASL : SchedReadAdvance<2, [N3Wr_VMASL]>;
+
+def N3Wr_ADA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ADA : SchedReadAdvance<3, [N3Wr_ADA]>;
+
+def N3Wr_VDOT : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_VDOT : SchedReadAdvance<2, [N3Wr_VDOT]>;
+
+def N3Wr_VMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_VMMA : SchedReadAdvance<2, [N3Wr_VMMA]>;
+
+def N3Wr_FCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FCMA : SchedReadAdvance<2, [N3Wr_FCMA]>;
+
+def N3Wr_FPM : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Wr_FPMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FPMA : SchedReadAdvance<2, [N3Wr_FPM, N3Wr_FPMA]>;
+
+def N3Wr_FPMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FPMAL : SchedReadAdvance<2, [N3Wr_FPMAL]>;
+
+def N3Wr_BFD : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_BFD : SchedReadAdvance<2, [N3Wr_BFD]>;
+
+def N3Wr_BFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
+def N3Rd_BFMMA : SchedReadAdvance<2, [N3Wr_BFMMA]>;
+
+def N3Wr_BFMLA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_BFMLA : SchedReadAdvance<2, [N3Wr_BFMLA]>;
+
+def N3Wr_CRC : SchedWriteRes<[N3UnitM0]> { let Latency = 2; }
+def N3Rd_CRC : SchedReadAdvance<1, [N3Wr_CRC]>;
+
+def N3Wr_ZA  : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZA  : SchedReadAdvance<3, [N3Wr_ZA]>;
+def N3Wr_ZPA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZPA : SchedReadAdvance<3, [N3Wr_ZPA]>;
+def N3Wr_ZSA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZSA : SchedReadAdvance<3, [N3Wr_ZSA]>;
+
+def N3Wr_ZDOTB : SchedWriteRes<[N3UnitV]>   { let Latency = 3; }
+def N3Rd_ZDOTB : SchedReadAdvance<2, [N3Wr_ZDOTB]>;
+def N3Wr_ZDOTH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZDOTH : SchedReadAdvance<3, [N3Wr_ZDOTH]>;
+
+def N3Wr_ZCMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZCMABHS : SchedReadAdvance<3, [N3Wr_ZCMABHS]>;
+def N3Wr_ZCMAD   : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZCMAD   : SchedReadAdvance<2, [N3Wr_ZCMAD]>;
+
+def N3Wr_ZMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_ZMMA : SchedReadAdvance<2, [N3Wr_ZMMA]>;
+
+def N3Wr_ZMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZMABHS : SchedReadAdvance<3, [N3Wr_ZMABHS]>;
+def N3Wr_ZMAD  : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZMAD  : SchedReadAdvance<2, [N3Wr_ZMAD]>;
+
+def N3Wr_ZMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZMAL : SchedReadAdvance<3, [N3Wr_ZMAL]>;
+
+def N3Wr_ZMASQL   : SchedWriteRes<[N3UnitV0]>            { let Latency = 4; }
+def N3Wr_ZMASQBHS : SchedWriteRes<[N3UnitV0]>            { let Latency = 4; }
+def N3Wr_ZMASQD   : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZMASQ    : SchedReadAdvance<2, [N3Wr_ZMASQL, N3Wr_ZMASQBHS,
+                                        N3Wr_ZMASQD]>;
+
+def N3Wr_ZFCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFCMA : SchedReadAdvance<2, [N3Wr_ZFCMA]>;
+
+def N3Wr_ZFMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFMA : SchedReadAdvance<2, [N3Wr_ZFMA]>;
+
+def N3Wr_ZFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFMAL : SchedReadAdvance<2, [N3Wr_ZFMAL]>;
+
+def N3Wr_ZBFDOT : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZBFDOT : SchedReadAdvance<2, [N3Wr_ZBFDOT]>;
+def N3Wr_ZBFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
+def N3Rd_ZBFMMA : SchedReadAdvance<2, [N3Wr_ZBFMMA]>;
+def N3Wr_ZBFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZBFMAL : SchedReadAdvance<2, [N3Wr_ZBFMAL]>;
+
 // Miscellaneous
 // -----------------------------------------------------------------------------
 
@@ -832,10 +933,11 @@ def : SchedAlias<WriteFDiv , N3Write_7c_1V0>;
 def : InstRW<[N3Write_12c_1V0], (instrs FDIVDrr, FSQRTDr)>;
 
 // FP multiply
-def : SchedAlias<WriteFMul, N3Write_3c_1V>;
+def : WriteRes<WriteFMul, [N3UnitV]> { let Latency = 3; }
 
 // FP multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
+def : InstRW<[N3Wr_FMA, ReadDefault, ReadDefault, N3Rd_FMA],
+             (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
 
 // FP round to integral
 def : InstRW<[N3Write_3c_1V0], (instregex "^FRINT([AIMNPXZ]|32X|64X|32Z|64Z)[DHS]r$")>;
@@ -969,7 +1071,7 @@ def : SchedAlias<WriteVq, N3Write_2c_1V>;
 // ASIMD absolute diff accum long
 // ASIMD pairwise add and accumulate long
 // ASIMD shift accumulate
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL?v",
+def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v",
                                           "^[SU]ADALPv",
                                           "^[SU]R?SRAv")>;
 
@@ -984,10 +1086,11 @@ def : InstRW<[N3Write_6c_2V1], (instregex "^[SU]?ADDL?Vv16i8v$")>;
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
-def : InstRW<[N3Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+def : InstRW<[N3Wr_VDOT, N3Rd_VDOT],
+             (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
 
 // ASIMD matrix multiply-accumulate
-def : InstRW<[N3Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+def : InstRW<[N3Wr_VMMA, N3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
 
 // ASIMD max/min, reduce, 4H/4S
 def : InstRW<[N3Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i(16|32)v$")>;
@@ -1002,16 +1105,16 @@ def : InstRW<[N3Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
 def : InstRW<[N3Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
 
 // ASIMD multiply accumulate
-def : InstRW<[N3Write_4c_1V0], (instregex "^MLAv", "^MLSv")>;
+def : InstRW<[N3Wr_VMA, N3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
 
 // ASIMD multiply accumulate high
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
 
 // ASIMD multiply accumulate long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
 
 // ASIMD multiply accumulate saturating long
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>;
+def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>;
 
 // ASIMD multiply/multiply long (8x8) polynomial, D-form
 // ASIMD multiply/multiply long (8x8) polynomial, Q-form
@@ -1058,7 +1161,7 @@ def : InstRW<[N3Write_4c_1V1],
 def : InstRW<[N3Write_3c_1V], (instregex "^FCADDv")>;
 
 // ASIMD FP complex multiply add
-def : InstRW<[N3Write_4c_1V], (instregex "^FCMLAv")>;
+def : InstRW<[N3Wr_FCMA, N3Rd_FCMA], (instregex "^FCMLAv")>;
 
 // ASIMD FP convert, long (F16 to F32)
 def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
@@ -1114,13 +1217,13 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
 def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
 
 // ASIMD FP multiply
-def : InstRW<[N3Write_3c_1V], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>;
 
 // ASIMD FP multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>;
 
 // ASIMD FP multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^FMLALv", "^FMLSLv")>;
+def : InstRW<[N3Wr_FPMAL, N3Rd_FPMAL], (instregex "^FMLALv", "^FMLSLv")>;
 
 // ASIMD FP round, D-form F32 and Q-form F64
 def : InstRW<[N3Write_3c_1V0],
@@ -1157,13 +1260,14 @@ def : InstRW<[N3Write_13c_2V0], (instrs FSQRTv2f64)>;
 def : InstRW<[N3Write_4c_2V0], (instrs BFCVTN, BFCVTN2)>;
 
 // ASIMD dot product
-def : InstRW<[N3Write_4c_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+def : InstRW<[N3Wr_BFD, N3Rd_BFD], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
 
 // ASIMD matrix multiply accumulate
-def : InstRW<[N3Write_5c_1V], (instrs BFMMLA)>;
+def : InstRW<[N3Wr_BFMMA, N3Rd_BFMMA], (instrs BFMMLA)>;
 
 // ASIMD multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
+def : InstRW<[N3Wr_BFMLA, N3Rd_BFMLA],
+             (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
 
 // Scalar convert, F32 to BF16
 def : InstRW<[N3Write_3c_1V0], (instrs BFCVT)>;
@@ -1502,7 +1606,7 @@ def : InstRW<[N3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
 // -----------------------------------------------------------------------------
 
 // CRC checksum ops
-def : InstRW<[N3Write_2c_1M0], (instregex "^CRC32")>;
+def : InstRW<[N3Wr_CRC, N3Rd_CRC], (instregex "^CRC32")>;
 
 // SVE Predicate instructions
 // -----------------------------------------------------------------------------
@@ -1592,10 +1696,10 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
                                          "^[SU]ABD_ZPZZ_[BHSD]")>;
 
 // Arithmetic, absolute diff accum
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
+def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
 
 // Arithmetic, absolute diff accum long
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
+def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
 
 // Arithmetic, absolute diff long
 def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>;
@@ -1629,7 +1733,8 @@ def : InstRW<[N3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>;
 def : InstRW<[N3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>;
 
 // Arithmetic, pairwise add and accum long
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
+def : InstRW<[N3Wr_ZPA, ReadDefault, N3Rd_ZPA],
+             (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
 
 // Arithmetic, shift
 def : InstRW<[N3Write_2c_1V1],
@@ -1642,7 +1747,7 @@ def : InstRW<[N3Write_2c_1V1],
                         "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
 
 // Arithmetic, shift and accumulate
-def : InstRW<[N3Write_4c_1V1],
+def : InstRW<[N3Wr_ZSA, N3Rd_ZSA],
              (instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>;
 
 // Arithmetic, shift by immediate
@@ -1688,16 +1793,17 @@ def : InstRW<[N3Write_2c_1V],
 def : InstRW<[N3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>;
 
 // Complex dot product 8-bit element
-def : InstRW<[N3Write_3c_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
 
 // Complex dot product 16-bit element
-def : InstRW<[N3Write_4c_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
 
 // Complex multiply-add B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
+def : InstRW<[N3Wr_ZCMABHS, N3Rd_ZCMABHS],
+             (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
 
 // Complex multiply-add D element size
-def : InstRW<[N3Write_5c_2V0], (instrs CMLA_ZZZ_D)>;
+def : InstRW<[N3Wr_ZCMAD, N3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
 
 // Conditional extract operations, scalar form
 def : InstRW<[N3Write_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
@@ -1736,13 +1842,14 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D",
                                             "^[SU]DIV_ZPZZ_D")>;
 
 // Dot product, 8 bit
-def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
 
 // Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB],
+             (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
 
 // Dot product, 16 bit
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
+def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
 
 // Duplicate, immediate and indexed form
 def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$",
@@ -1804,7 +1911,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
 def : InstRW<[N3Write_2c_1V], (instregex "^N?MATCH_PPzZZ_[BH]$")>;
 
 // Matrix multiply-accumulate
-def : InstRW<[N3Write_3c_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+def : InstRW<[N3Wr_ZMMA, N3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
 
 // Move prefix
 def : InstRW<[N3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
@@ -1827,20 +1934,25 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
                                           "^[SU]MULL[BT]_ZZZ_[HSD]$")>;
 
 // Multiply accumulate, B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
-                                          "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
+// Predicated forms list the governing predicate as their first source operand,
+// hence the ReadDefault placeholder; indexed forms start with the accumulator.
+def : InstRW<[N3Wr_ZMABHS, ReadDefault, N3Rd_ZMABHS],
+             (instregex "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
+def : InstRW<[N3Wr_ZMABHS, N3Rd_ZMABHS], (instregex "^ML[AS]_ZZZI_[BHS]$")>;
 
 // Multiply accumulate, D element size
-def : InstRW<[N3Write_5c_2V0], (instregex "^ML[AS]_ZZZI_D$",
-                                          "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
+def : InstRW<[N3Wr_ZMAD, ReadDefault, N3Rd_ZMAD],
+             (instregex "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
+def : InstRW<[N3Wr_ZMAD, N3Rd_ZMAD], (instregex "^ML[AS]_ZZZI_D$")>;
 
 // Multiply accumulate long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
+def : InstRW<[N3Wr_ZMAL, N3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
                                           "^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;
 
 // Multiply accumulate saturating doubling long regular
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
-                                          "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
+def : InstRW<[N3Wr_ZMASQL, N3Rd_ZMASQ],
+            (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
+                       "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
 
 // Multiply saturating doubling high, B, H, S element size
 def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
@@ -1854,13 +1963,13 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
                                           "^SQDMULL[BT]_ZZZI_[SD]$")>;
 
 // Multiply saturating rounding doubling regular/complex accumulate, B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
+def : InstRW<[N3Wr_ZMASQBHS, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
                                           "^SQRDCMLAH_ZZZ_[BHS]$",
                                           "^SQRDML[AS]H_ZZZI_[HS]$",
                                           "^SQRDCMLAH_ZZZI_[HS]$")>;
 
 // Multiply saturating rounding doubling regular/complex accumulate, D element size
-def : InstRW<[N3Write_5c_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
+def : InstRW<[N3Wr_ZMASQD, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D$",
                                           "^SQRDCMLAH_ZZZ_D$")>;
 
 // Multiply saturating rounding doubling regular/complex, B, H, S element size
@@ -1949,8 +2058,9 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
 def : InstRW<[N3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>;
 
 // Floating point complex multiply add
-def : InstRW<[N3Write_4c_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$",
-                                         "^FCMLA_ZZZI_[HS]$")>;
+def : InstRW<[N3Wr_ZFCMA, ReadDefault, N3Rd_ZFCMA],
+             (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Wr_ZFCMA, N3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
 
 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
 def : InstRW<[N3Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
@@ -2014,12 +2124,15 @@ def : InstRW<[N3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
                                          "^FMUL_ZPZ[IZ]_[HSD]")>;
 
 // Floating point multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
-                                         "^FN?ML[AS]_ZPZZZ_[HSD]",
-                                         "^FML[AS]_ZZZI_[HSD]$")>;
+def : InstRW<[N3Wr_ZFMA, ReadDefault, N3Rd_ZFMA],
+             (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+                        "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Wr_ZFMA, N3Rd_ZFMA],
+             (instregex "^FML[AS]_ZZZI_[HSD]",
+                        "^FN?ML[AS]_ZPZZZ_[HSD]")>;
 
 // Floating point multiply add/sub accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
+def : InstRW<[N3Wr_ZFMAL, N3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
 
 // Floating point reciprocal estimate, F16
 def : InstRW<[N3Write_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
@@ -2079,13 +2192,13 @@ def : InstRW<[N3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
 def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
 
 // Dot product
-def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+def : InstRW<[N3Wr_ZBFDOT, N3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
 
 // Matrix multiply accumulate
-def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>;
+def : InstRW<[N3Wr_ZBFMMA, N3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
 
 // Multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
+def : InstRW<[N3Wr_ZBFMAL, N3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
 
 // SVE Load instructions
 // -----------------------------------------------------------------------------
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
index 59e3af8abd708..f6b9db13624b6 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
@@ -329,7 +329,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
+# CHECK:      Dispatch Width:    5
 # CHECK-NEXT: uOps Per Cycle:    0.57
 # CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -343,9 +343,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,2]     D===eeER  .    ..   madd	x0, x1, x2, x0
 # CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
 # CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,1]     .D========eeER ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     .D=========eeER..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     .D===========eeER   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -355,10 +355,10 @@ bfmlalb z0.s, z0.h, z1.h
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       madd	x0, x0, x0, x0
-# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     2     6.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     7.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     9.0    0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     6.6    0.1    0.0       <total>
 
 # CHECK:      [1] Code Region - smaddl
 
@@ -367,7 +367,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
+# CHECK:      Dispatch Width:    5
 # CHECK-NEXT: uOps Per Cycle:    0.57
 # CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -381,9 +381,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,2]     D===eeER  .    ..   smaddl	x0, w1, w2, x0
 # CHECK-NEXT: [0,3]     D=====eeER.    ..   smaddl	x0, w0, w0, x0
 # CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     D=========eeER ..   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,2]     D==========eeER..   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,3]     D============eeER   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,1]     .D========eeER ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,2]     .D=========eeER..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     .D===========eeER   smaddl	x0, w0, w0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -393,39 +393,39 @@ bfmlalb z0.s, z0.h, z1.h
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     6.5    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 2.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
-# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     2     6.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 2.     2     7.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     9.0    0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     6.6    0.1    0.0       <total>
 
 # CHECK:      [2] Code Region - fmadd
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   .   fadd	d0, d0, d0
-# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    .    .   .   fmul	d0, d0, d0
-# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmadd	d0, d0, d1, d2
-# CHECK-NEXT: [1,0]     D=====================eeER    .    .    .   .   fadd	d0, d0, d0
-# CHECK-NEXT: [1,1]     D=======================eeeeER.    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,2]     D===========================eeeER  .    .   .   fmul	d0, d0, d0
-# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,4]     .D=================================eeeeER   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,5]     .D=====================================eeeeER   fmadd	d0, d0, d1, d2
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    ..   fmul	d0, d0, d0
+# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,5]     .D============eeeeER.    .    .    ..   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     .D================eeER   .    .    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     .D======================eeeER .    ..   fmul	d0, d0, d0
+# CHECK-NEXT: [1,3]     .D=======================eeeeER    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,4]     . D========================eeeeER  ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     . D============================eeeeER   fmadd	d0, d0, d1, d2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -434,38 +434,38 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fadd	d0, d0, d0
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fmul	d0, d0, d0
-# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 5.     2     28.0   0.0    0.0       fmadd	d0, d0, d1, d2
-# CHECK-NEXT:        2     19.2   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 4.     2     17.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     21.0   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     14.9   0.1    0.0       <total>
 
 # CHECK:      [3] Code Region - saba
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   saba	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D============================eeeeER   saba	v0.4s, v0.4s, v1.4s
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D================eeeeER .  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   saba	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -474,36 +474,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     11.6   0.1    0.0       <total>
 
 # CHECK:      [4] Code Region - sdot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.31
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   sdot	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D=================eeeER  .  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2]     D====================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3]     D=======================eeeER   sdot	v0.4s, v0.16b, v1.16b
+# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D========eeeER .    .   .   sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D==============eeeER   .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     .D===============eeeER  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     .D==================eeeER   sdot	v0.4s, v0.16b, v1.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -512,36 +512,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     11.0   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     14.0   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     10.4   0.1    0.0       <total>
 
 # CHECK:      [5] Code Region - smmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.31
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   smmla	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D=================eeeER  .  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2]     D====================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3]     D=======================eeeER   smmla	v0.4s, v0.16b, v1.16b
+# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D==============eeeER   .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     .D===============eeeER  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     .D==================eeeER   smmla	v0.4s, v0.16b, v1.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -550,36 +550,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     11.0   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     14.0   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     10.4   0.1    0.0       <total>
 
 # CHECK:      [6] Code Region - mla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D============================eeeeER   mla	v0.4s, v0.4s, v1.4s
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D================eeeeER .  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   mla	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -588,36 +588,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     11.6   0.1    0.0       <total>
 
 # CHECK:      [7] Code Region - sqrdmlah
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D============================eeeeER   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK:      [0,0]     DeeeeER   .    .    .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    ..   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    ..   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D========eeeeER.    .    ..   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,0]     D============eeeeER .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D===============eeeeER  ..   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .D=================eeeeER..   sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D===================eeeeER   sqrdmlah	v0.8h, v1.8h, v2.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -626,36 +626,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     14.5   0.0    0.0       sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT:        2     11.1   0.1    0.0       <total>
 
 # CHECK:      [8] Code Region - smlal2
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D============================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D================eeeeER .  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -664,36 +664,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.6   0.1    0.0       <total>
 
 # CHECK:      [9] Code Region - sqdmlal2
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D============================eeeeER   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK:      [0,0]     DeeeeER   .    .    .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    ..   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    ..   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D========eeeeER.    .    ..   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,0]     D============eeeeER .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D===============eeeeER  ..   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .D=================eeeeER..   sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D===================eeeeER   sqdmlal2	v0.4s, v1.8h, v2.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -702,36 +702,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     14.5   0.0    0.0       sqdmlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT:        2     11.1   0.1    0.0       <total>
 
 # CHECK:      [10] Code Region - sadalp
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sadalp	v0.2d, v0.4s
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [1,3]     D============================eeeeER   sadalp	v0.2d, v0.4s
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   sadalp	v0.2d, v0.4s
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D================eeeeER .  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   sadalp	v0.2d, v0.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -740,36 +740,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sadalp	v0.2d, v1.4s
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sadalp	v0.2d, v1.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sadalp	v0.2d, v0.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sadalp	v0.2d, v0.4s
+# CHECK-NEXT:        2     11.6   0.1    0.0       <total>
 
 # CHECK:      [11] Code Region - fcmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -778,40 +778,40 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [12] Code Region - fmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0]     D=====================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D========================eeeeER    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2]     D============================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4]     .D=================================eeeeER   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5]     .D=====================================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D=eeeeER  .    .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2]     D=====eeER.    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5]     .D============eeeeER.    .    .    ..   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     .D================eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D=================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     .D=====================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     .D=======================eeeeER    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4]     . D========================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     . D============================eeeeER   fmla	v0.2d, v0.2d, v1.2d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -820,42 +820,42 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5.     2     28.0   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT:        2     19.5   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     14.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4.     2     17.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     21.0   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     14.6   0.1    0.0       <total>
 
 # CHECK:      [13] Code Region - fmlal
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.32
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0]     D=====================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D========================eeeeER    .    .   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2]     D============================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,4]     .D=================================eeeeER   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5]     .D=====================================eeeeER   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,4]     D===========eeeeER  .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5]     .D==============eeeeER   .    .    .    .   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     .D==================eeeER.    .    .    .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D=====================eeeeER .    .    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     .D=========================eeER    .    .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     .D===========================eeeeER.    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4]     . D============================eeeeER   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     . D================================eeeeER   fmlal	v0.4s, v0.4h, v1.4h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -864,38 +864,38 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5.     2     28.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT:        2     19.5   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4.     2     20.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     24.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     17.3   0.1    0.0       <total>
 
 # CHECK:      [14] Code Region - bfdot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   bfdot	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -904,36 +904,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [15] Code Region - bfmmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D==================eeeeeER   .   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .D=====================eeeeeER.   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D==========================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -942,36 +942,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     13.6   0.1    0.0       <total>
 
 # CHECK:      [16] Code Region - bfmlalb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfmlalb	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -980,50 +980,50 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [17] Code Region - crc32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1100
-# CHECK-NEXT: Total Cycles:      2203
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        1100
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.78
+# CHECK-NEXT: IPC:               0.78
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .    ..   mul	w0, w0, w0
-# CHECK-NEXT: [0,1]     D==eeER   .    .    .    .    .    .    .    ..   crc32cb	w0, w0, w1
-# CHECK-NEXT: [0,2]     D====eeER .    .    .    .    .    .    .    ..   crc32cb	w0, w0, w1
-# CHECK-NEXT: [0,3]     D======eeER    .    .    .    .    .    .    ..   crc32cb	w0, w0, w0
-# CHECK-NEXT: [0,4]     D========eeER  .    .    .    .    .    .    ..   crc32b	w0, w0, w15
-# CHECK-NEXT: [0,5]     D==========eeER.    .    .    .    .    .    ..   crc32h	w0, w0, w21
-# CHECK-NEXT: [0,6]     D============eeER   .    .    .    .    .    ..   crc32w	w0, w0, w24
-# CHECK-NEXT: [0,7]     D==============eeER .    .    .    .    .    ..   crc32x	w0, w0, x25
-# CHECK-NEXT: [0,8]     D================eeER    .    .    .    .    ..   crc32ch	w0, w0, w16
-# CHECK-NEXT: [0,9]     D==================eeER  .    .    .    .    ..   crc32cw	w0, w0, w23
-# CHECK-NEXT: [0,10]    .D===================eeER.    .    .    .    ..   crc32cx	w0, w0, x5
-# CHECK-NEXT: [1,0]     .D=====================eeER   .    .    .    ..   mul	w0, w0, w0
-# CHECK-NEXT: [1,1]     .D=======================eeER .    .    .    ..   crc32cb	w0, w0, w1
-# CHECK-NEXT: [1,2]     .D=========================eeER    .    .    ..   crc32cb	w0, w0, w1
-# CHECK-NEXT: [1,3]     .D===========================eeER  .    .    ..   crc32cb	w0, w0, w0
-# CHECK-NEXT: [1,4]     .D=============================eeER.    .    ..   crc32b	w0, w0, w15
-# CHECK-NEXT: [1,5]     .D===============================eeER   .    ..   crc32h	w0, w0, w21
-# CHECK-NEXT: [1,6]     .D=================================eeER .    ..   crc32w	w0, w0, w24
-# CHECK-NEXT: [1,7]     .D===================================eeER    ..   crc32x	w0, w0, x25
-# CHECK-NEXT: [1,8]     .D=====================================eeER  ..   crc32ch	w0, w0, w16
-# CHECK-NEXT: [1,9]     . D======================================eeER..   crc32cw	w0, w0, w23
-# CHECK-NEXT: [1,10]    . D========================================eeER   crc32cx	w0, w0, x5
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .   mul	w0, w0, w0
+# CHECK-NEXT: [0,1]     D==eeER   .    .    .    .    .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,2]     D===eeER  .    .    .    .    .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,3]     D=====eeER.    .    .    .    .   crc32cb	w0, w0, w0
+# CHECK-NEXT: [0,4]     D======eeER    .    .    .    .   crc32b	w0, w0, w15
+# CHECK-NEXT: [0,5]     .D======eeER   .    .    .    .   crc32h	w0, w0, w21
+# CHECK-NEXT: [0,6]     .D=======eeER  .    .    .    .   crc32w	w0, w0, w24
+# CHECK-NEXT: [0,7]     .D========eeER .    .    .    .   crc32x	w0, w0, x25
+# CHECK-NEXT: [0,8]     .D=========eeER.    .    .    .   crc32ch	w0, w0, w16
+# CHECK-NEXT: [0,9]     .D==========eeER    .    .    .   crc32cw	w0, w0, w23
+# CHECK-NEXT: [0,10]    . D==========eeER   .    .    .   crc32cx	w0, w0, x5
+# CHECK-NEXT: [1,0]     . D============eeER .    .    .   mul	w0, w0, w0
+# CHECK-NEXT: [1,1]     . D==============eeER    .    .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,2]     . D===============eeER   .    .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,3]     . D=================eeER .    .   crc32cb	w0, w0, w0
+# CHECK-NEXT: [1,4]     .  D=================eeER.    .   crc32b	w0, w0, w15
+# CHECK-NEXT: [1,5]     .  D==================eeER    .   crc32h	w0, w0, w21
+# CHECK-NEXT: [1,6]     .  D===================eeER   .   crc32w	w0, w0, w24
+# CHECK-NEXT: [1,7]     .  D====================eeER  .   crc32x	w0, w0, x25
+# CHECK-NEXT: [1,8]     .  D=====================eeER .   crc32ch	w0, w0, w16
+# CHECK-NEXT: [1,9]     .   D=====================eeER.   crc32cw	w0, w0, w23
+# CHECK-NEXT: [1,10]    .   D======================eeER   crc32cx	w0, w0, x5
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1032,43 +1032,43 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       mul	w0, w0, w0
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       crc32cb	w0, w0, w1
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       crc32cb	w0, w0, w1
-# CHECK-NEXT: 3.     2     17.5   0.0    0.0       crc32cb	w0, w0, w0
-# CHECK-NEXT: 4.     2     19.5   0.0    0.0       crc32b	w0, w0, w15
-# CHECK-NEXT: 5.     2     21.5   0.0    0.0       crc32h	w0, w0, w21
-# CHECK-NEXT: 6.     2     23.5   0.0    0.0       crc32w	w0, w0, w24
-# CHECK-NEXT: 7.     2     25.5   0.0    0.0       crc32x	w0, w0, x25
-# CHECK-NEXT: 8.     2     27.5   0.0    0.0       crc32ch	w0, w0, w16
-# CHECK-NEXT: 9.     2     29.0   0.0    0.0       crc32cw	w0, w0, w23
-# CHECK-NEXT: 10.    2     30.5   0.0    0.0       crc32cx	w0, w0, x5
-# CHECK-NEXT:        2     21.4   0.0    0.0       <total>
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	w0, w0, w0
+# CHECK-NEXT: 1.     2     9.0    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 2.     2     10.0   0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 3.     2     12.0   0.0    0.0       crc32cb	w0, w0, w0
+# CHECK-NEXT: 4.     2     12.5   0.0    0.0       crc32b	w0, w0, w15
+# CHECK-NEXT: 5.     2     13.0   0.0    0.0       crc32h	w0, w0, w21
+# CHECK-NEXT: 6.     2     14.0   0.0    0.0       crc32w	w0, w0, w24
+# CHECK-NEXT: 7.     2     15.0   0.0    0.0       crc32x	w0, w0, x25
+# CHECK-NEXT: 8.     2     16.0   0.0    0.0       crc32ch	w0, w0, w16
+# CHECK-NEXT: 9.     2     16.5   0.0    0.0       crc32cw	w0, w0, w23
+# CHECK-NEXT: 10.    2     17.0   0.0    0.0       crc32cx	w0, w0, x5
+# CHECK-NEXT:        2     12.9   0.0    0.0       <total>
 
 # CHECK:      [18] Code Region - Z saba
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   saba	z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D==============================eeeeER   saba	z0.d, z0.d, z1.d
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   saba	z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   saba	z0.d, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1077,36 +1077,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       saba	z0.d, z0.d, z1.d
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       saba	z0.d, z0.d, z1.d
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [19] Code Region - Z sabalt
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	z0.h, z0.h, z0.h
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sabalt	z0.h, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sabalt	z0.h, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sabalt	z0.h, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	z0.h, z0.h, z0.h
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sabalt	z0.h, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   sabalt	z0.h, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D============================eeeeER   sabalt	z0.h, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	z0.h, z0.h, z0.h
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   sabalt	z0.h, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	z0.h, z0.h, z0.h
+# CHECK-NEXT: [1,1]     .D================eeeeER .  .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   sabalt	z0.h, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1115,36 +1115,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	z0.h, z0.h, z0.h
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sabalt	z0.h, z1.b, z2.b
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sabalt	z0.h, z1.b, z2.b
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sabalt	z0.h, z0.b, z1.b
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.h, z0.h, z0.h
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       sabalt	z0.h, z1.b, z2.b
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sabalt	z0.h, z0.b, z1.b
+# CHECK-NEXT:        2     11.6   0.1    0.0       <total>
 
 # CHECK:      [20] Code Region - Z sadalp
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sadalp	z0.d, p0/m, z0.s
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,3]     D==============================eeeeER   sadalp	z0.d, p0/m, z0.s
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sadalp	z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   sadalp	z0.d, p0/m, z0.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1153,36 +1153,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sadalp	z0.d, p0/m, z0.s
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sadalp	z0.d, p0/m, z0.s
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [21] Code Region - Z ssra
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   ssra	z0.d, z1.d, #1
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   ssra	z0.d, z1.d, #1
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   ssra	z0.d, z0.d, #1
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   ssra	z0.d, z1.d, #1
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   ssra	z0.d, z1.d, #1
-# CHECK-NEXT: [1,3]     D==============================eeeeER   ssra	z0.d, z0.d, #1
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   ssra	z0.d, z0.d, #1
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   ssra	z0.d, z0.d, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1191,36 +1191,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       ssra	z0.d, z1.d, #1
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       ssra	z0.d, z1.d, #1
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       ssra	z0.d, z0.d, #1
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       ssra	z0.d, z1.d, #1
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       ssra	z0.d, z1.d, #1
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       ssra	z0.d, z0.d, #1
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [22] Code Region - Z cdot.s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   cdot	z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeER.    .   cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2]     D======================eeeER  .   cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3]     D=========================eeeER   cdot	z0.s, z0.b, z1.b, #90
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   cdot	z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0]     .D===========eeeeeER.    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D================eeeER  ..   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2]     .D=================eeeER ..   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3]     .D====================eeeER   cdot	z0.s, z0.b, z1.b, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1229,36 +1229,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 3.     2     19.0   0.0    0.0       cdot	z0.s, z0.b, z1.b, #90
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3.     2     15.5   0.0    0.0       cdot	z0.s, z0.b, z1.b, #90
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [23] Code Region - Z cdot.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   cdot	z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,3]     D==============================eeeeER   cdot	z0.d, z0.h, z1.h, #90
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   cdot	z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   cdot	z0.d, z0.h, z1.h, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1267,36 +1267,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       cdot	z0.d, z0.h, z1.h, #90
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       cdot	z0.d, z0.h, z1.h, #90
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [24] Code Region - Z cmla.b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   cmla	z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3]     D==============================eeeeER   cmla	z0.b, z0.b, z1.b, #90
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   cmla	z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   cmla	z0.b, z0.b, z1.b, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1305,36 +1305,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       cmla	z0.b, z0.b, z1.b, #90
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       cmla	z0.b, z0.b, z1.b, #90
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [25] Code Region - Z cmla.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      2003
-# CHECK-NEXT: Total uOps:        800
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.40
-# CHECK-NEXT: IPC:               0.20
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.22
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   cmla	z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     .D========================eeeeeER  .    . .   cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2]     .D=============================eeeeeER  . .   cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3]     .D==================================eeeeeER   cmla	z0.d, z0.d, z1.d, #90
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   cmla	z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     .D=================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .  .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     .D=========================eeeeeER .  .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     .D==============================eeeeeER   cmla	z0.d, z0.d, z1.d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1343,36 +1343,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     15.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 2.     2     20.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 3.     2     25.5   0.0    0.0       cmla	z0.d, z0.d, z1.d, #90
-# CHECK-NEXT:        2     18.1   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       cmla	z0.d, z0.d, z1.d, #90
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [26] Code Region - Z sdot.s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     .D===========eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D================eeeER  ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D=================eeeER ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D====================eeeER   sdot	z0.s, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1381,36 +1381,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     15.5   0.0    0.0       sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [27] Code Region - Z sudot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b[1]
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0]     .D===========eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D================eeeER  ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2]     .D=================eeeER ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3]     .D====================eeeER   sdot	z0.s, z0.b, z1.b[1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1419,36 +1419,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3.     2     15.5   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [28] Code Region - Z sdot.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sdot	z0.d, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D==============================eeeeER   sdot	z0.d, z0.h, z1.h
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   sdot	z0.d, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1457,36 +1457,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sdot	z0.d, z0.h, z1.h
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [29] Code Region - Z smmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.31
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	z0.s, z0.s, z0.s
-# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   smmla	z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	z0.s, z0.s, z0.s
-# CHECK-NEXT: [1,1]     D=================eeeER  .  .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D====================eeeER  .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D=======================eeeER   smmla	z0.s, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	z0.s, z0.s, z0.s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	z0.s, z0.s, z0.s
+# CHECK-NEXT: [1,1]     .D==============eeeER   .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D===============eeeER  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D==================eeeER   smmla	z0.s, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1495,36 +1495,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.s, z0.s, z0.s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smmla	z0.s, z0.b, z1.b
-# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.s, z0.s, z0.s
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     11.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     14.0   0.0    0.0       smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     10.4   0.1    0.0       <total>
 
 # CHECK:      [30] Code Region - Z mla.b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   mla	z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D==============================eeeeER   mla	z0.b, p0/m, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   mla	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   mla	z0.b, p0/m, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1533,36 +1533,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       mla	z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       mla	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [31] Code Region - Z mla.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      2003
-# CHECK-NEXT: Total uOps:        800
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.40
-# CHECK-NEXT: IPC:               0.20
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.22
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   mla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     .D========================eeeeeER  .    . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     .D=============================eeeeeER  . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     .D==================================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     .D=================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .  .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=========================eeeeeER .  .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==============================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1571,36 +1571,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     15.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     20.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     25.5   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     18.1   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [32] Code Region - Z smlalb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   smlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3]     D==============================eeeeER   smlalb	z0.d, z0.s, z1.s
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   smlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   smlalb	z0.d, z0.s, z1.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1609,36 +1609,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       smlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [33] Code Region - Z sqdmlalb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          012
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sqdmlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3]     D==============================eeeeER   sqdmlalb	z0.d, z0.s, z1.s
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    . .   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   sqdmlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0]     .D==============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===================eeeeER   . .   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2]     .D=====================eeeeER . .   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3]     .D=========================eeeeER   sqdmlalb	z0.d, z0.s, z1.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1647,36 +1647,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sqdmlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sqdmlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
 
 # CHECK:      [34] Code Region - Z sqrdmlah.b
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          012
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sqrdmlah	z0.b, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D==============================eeeeER   sqrdmlah	z0.b, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    . .   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   sqrdmlah	z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0]     .D==============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===================eeeeER   . .   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D=====================eeeeER . .   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D=========================eeeeER   sqrdmlah	z0.b, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1685,36 +1685,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sqrdmlah	z0.b, z0.b, z1.b
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sqrdmlah	z0.b, z0.b, z1.b
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
 
 # CHECK:      [35] Code Region - Z sqrdmlah.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      2003
-# CHECK-NEXT: Total uOps:        800
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.40
-# CHECK-NEXT: IPC:               0.20
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.22
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   sqrdmlah	z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     .D========================eeeeeER  .    . .   sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2]     .D=============================eeeeeER  . .   sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3]     .D==================================eeeeeER   sqrdmlah	z0.d, z0.d, z1.d
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   sqrdmlah	z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0]     .D=================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .  .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=========================eeeeeER .  .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==============================eeeeeER   sqrdmlah	z0.d, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1723,36 +1723,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     15.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: 2.     2     20.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: 3.     2     25.5   0.0    0.0       sqrdmlah	z0.d, z0.d, z1.d
-# CHECK-NEXT:        2     18.1   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sqrdmlah	z0.d, z0.d, z1.d
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [36] Code Region - Z fcmla ZPmZZ
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1761,36 +1761,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [37] Code Region - Z fcmla ZZZI
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1799,36 +1799,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [38] Code Region - Z fmla ZPmZZ
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1837,36 +1837,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [39] Code Region - Z fmla ZZZI
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, z0.d, z1.d[1]
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, z0.d, z1.d[1]
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fmla	z0.d, z0.d, z1.d[1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1875,36 +1875,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [40] Code Region - Z fmlalb ZZZ
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fmlalb	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fmlalb	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1913,36 +1913,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [41] Code Region - Z bfdot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   bfdot	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1951,36 +1951,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>
 
 # CHECK:      [42] Code Region - Z bfmmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeeER   .   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     .D=====================eeeeeER.   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D==========================eeeeeER   bfmmla	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1989,36 +1989,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.6   0.1    0.0       <total>
 
 # CHECK:      [43] Code Region - bfmlalb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK:      Dispatch Width:    5
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfmlalb	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===============eeeeER  .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   bfmlalb	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2027,8 +2027,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     11.4   0.1    0.0       <total>



More information about the llvm-commits mailing list