[llvm] [AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. (PR #111538)

Rin Dobrescu via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 10 02:46:08 PDT 2024


https://github.com/Rin18 updated https://github.com/llvm/llvm-project/pull/111538

>From 5b36c2235ef60f05df66ae1dba479f35083734cc Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Fri, 4 Oct 2024 12:24:36 +0000
Subject: [PATCH 1/3] Add pre-commit test.

---
 .../Target/AArch64/AArch64SchedNeoverseV1.td  |    6 +-
 .../llvm-mca/AArch64/Neoverse/V1-forwarding.s | 1421 +++++++++++++++++
 2 files changed, 1425 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index f7e6545f0dd386..8abbfe63d9a806 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -843,7 +843,7 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
-def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
+def : InstRW<[V1Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
 
 // ASIMD matrix multiply- accumulate
 def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
@@ -896,11 +896,13 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
 // ASIMD FP negate
 // Covered by "SchedAlias (WriteV[dq]...)" above
 
-// ASIMD FP complex multiply add
 // ASIMD FP multiply accumulate
 def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$",
                                          "^FML[AS]v")>;
 
+// ASIMD FP complex multiply add
+def : InstRW<[V1Write_4c_1V], (instregex "^FCMLAv")>;
+
 // ASIMD FP convert, long (F16 to F32)
 def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
new file mode 100644
index 00000000000000..694c931c029412
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
@@ -0,0 +1,1421 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul  x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul    x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul  v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul    v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sdot
+mul  v0.4s, v0.4s,  v0.4s
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smmla
+mul   v0.4s, v0.4s,  v0.4s
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul    v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ssra
+mul  v0.4s, v0.4s, v0.4s
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v0.2d, #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fcmla
+fmul  v0.4s, v0.4s, v0.4s
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v1.2d, v2.2d, #90
+fcmla v0.2d, v0.2d, v1.2d, #90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmlal
+fmul  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fadd  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfdot
+fmul  v0.2d, v0.2d, v0.2d
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v1.8h, v2.8h
+bfdot v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmmla
+fmul   v0.2d, v0.2d, v0.2d
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v1.8h, v2.8h
+bfmmla v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul    v0.2d, v0.2d, v0.2d
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v1.8h, v2.8h
+bfmlalb v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32cb
+mul    w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.s
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z1.b, z2.b
+sdot z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sudot
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z1.b, z2.b[1]
+sdot z0.s, z0.b, z1.b[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z sdot.d
+mul z0.d, p0/m, z0.d, z0.d
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z1.h, z2.h
+sdot z0.d, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z smmla
+mul z0.d, p0/m, z0.d, z0.d
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z1.b, z2.b
+smmla z0.s, z0.b, z1.b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mla.d
+mul z0.d, p0/m, z0.d, z0.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z1.d, z2.d
+mla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z mad.d
+mul z0.d, p0/m, z0.d, z0.d
+mad z0.d, p0/m, z1.d, z2.d
+mad z0.d, p0/m, z1.d, z2.d
+mad z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z msb.d
+mul z0.d, p0/m, z0.d, z0.d
+msb z0.d, p0/m, z1.d, z2.d
+msb z0.d, p0/m, z1.d, z2.d
+msb z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZPmZZ
+fmul  z0.d, z0.d, z0.d
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z1.d, z2.d, 90
+fcmla z0.d, p0/m, z0.d, z1.d, 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fcmla ZZZI
+fmul  z0.d, z0.d, z0.d
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z1.s, z2.s[1], 90
+fcmla z0.s, z0.s, z1.s[1], 90
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZPmZZ
+fmul z0.d, z0.d, z0.d
+fmla z0.d, p0/m, z1.d, z2.d
+fmla z0.d, p0/m, z1.d, z2.d
+fmla z0.d, p0/m, z0.d, z1.d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z fmla ZZZI
+fmul z0.d, z0.d, z0.d
+fmla z0.d, z1.d, z2.d[1]
+fmla z0.d, z1.d, z2.d[1]
+fmla z0.d, z0.d, z1.d[1]
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z bfdot
+fmul  z0.d, z0.d, z0.d
+bfdot z0.s, z1.h, z2.h
+bfdot z0.s, z1.h, z2.h
+bfdot z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN Z bfmmla
+fmul   z0.d, z0.d, z0.d
+bfmmla z0.s, z1.h, z2.h
+bfmmla z0.s, z1.h, z2.h
+bfmmla z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN bfmlalb
+fmul    z0.d, z0.d, z0.d
+bfmlalb z0.s, z1.h, z2.h
+bfmlalb z0.s, z1.h, z2.h
+bfmlalb z0.s, z0.h, z1.h
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - madd
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .  .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeER   .    .  .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D====eeER .    .  .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     D======eeER    .  .   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,0]     D========eeER  .  .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     D==========eeER.  .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     D============eeER .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     D==============eeER   madd	x0, x0, x0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     7.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     9.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     11.0   0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     8.0    0.1    0.0       <total>
+
+# CHECK:      [1] Code Region - smaddl
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .  .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeER   .    .  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,2]     D====eeER .    .  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,3]     D======eeER    .  .   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,0]     D========eeER  .  .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     D==========eeER.  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,2]     D============eeER .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     D==============eeER   smaddl	x0, w0, w0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     7.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 2.     2     9.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     11.0   0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     8.0    0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - fmadd
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   .   fadd	d0, d0, d0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    .    .   .   fmul	d0, d0, d0
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     D=====================eeER    .    .    .   .   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     D=======================eeeeER.    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     D===========================eeeER  .    .   .   fmul	d0, d0, d0
+# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,4]     D==================================eeeeER   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     D======================================eeeeER   fmadd	d0, d0, d1, d2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 4.     2     24.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     28.5   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     19.3   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - saba
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     D============================eeeeER   saba	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - sadalp
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sadalp	v0.2d, v0.4s
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,3]     D============================eeeeER   sadalp	v0.2d, v0.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sadalp	v0.2d, v0.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - sdot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 0.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeER  .  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     D====================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     D=======================eeeER   sdot	v0.4s, v0.16b, v1.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - smmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: Block RThroughput: 0.8
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeER  .  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     D====================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     D=======================eeeER   smmla	v0.4s, v0.16b, v1.16b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - mla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     D============================eeeeER   mla	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - smlal2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D============================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - ssra
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,2]     D========================eeeeER   .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3]     D============================eeeeER   ssra	v0.2d, v0.2d, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - fcmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [11] Code Region - fmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     D=====================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D========================eeeeER    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     D============================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4]     D==================================eeeeER   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     D======================================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4.     2     24.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     28.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     19.7   0.1    0.0       <total>
+
+# CHECK:      [12] Code Region - fmlal
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.24
+# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2]     D========eeER  .    .    .    .    .    .    .    . .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    .    .    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,4]     D===============eeeeeER  .    .    .    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5]     D====================eeeeeER  .    .    .    .    . .   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     D=========================eeeER    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D============================eeeeeER    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     D=================================eeER  .    .    . .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     D===================================eeeeeER  .    . .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4]     D========================================eeeeeER  . .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     D=============================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     13.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     16.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     21.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     23.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4.     2     28.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     33.5   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     22.8   0.1    0.0       <total>
+
+# CHECK:      [13] Code Region - bfdot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [14] Code Region - bfmmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+
+# CHECK:      [15] Code Region - bfmlalb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [16] Code Region - crc32cb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .  .   mul	w0, w0, w0
+# CHECK-NEXT: [0,1]     D==eeER   .    .  .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,2]     D====eeER .    .  .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,3]     D======eeER    .  .   crc32cb	w0, w0, w0
+# CHECK-NEXT: [1,0]     D========eeER  .  .   mul	w0, w0, w0
+# CHECK-NEXT: [1,1]     D==========eeER.  .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,2]     D============eeER .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,3]     D==============eeER   crc32cb	w0, w0, w0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	w0, w0, w0
+# CHECK-NEXT: 1.     2     7.0    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 2.     2     9.0    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 3.     2     11.0   0.0    0.0       crc32cb	w0, w0, w0
+# CHECK-NEXT:        2     8.0    0.1    0.0       <total>
+
+# CHECK:      [17] Code Region - Z sdot.s
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [18] Code Region - Z sudot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b[1]
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [19] Code Region - Z sdot.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D==============================eeeeER   sdot	z0.d, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+
+# CHECK:      [20] Code Region - Z smmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeER.    .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D======================eeeER  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D=========================eeeER   smmla	z0.s, z0.b, z1.b
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [21] Code Region - Z mla.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D==============================eeeeeER  . .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==================================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     16.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     21.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     25.5   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     18.4   0.1    0.0       <total>
+
+# CHECK:      [22] Code Region - Z mad.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   mad	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    . .   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D==============================eeeeeER  . .   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==================================eeeeeER   mad	z0.d, p0/m, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     16.0   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     21.0   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     25.5   0.0    0.0       mad	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     18.4   0.1    0.0       <total>
+
+# CHECK:      [23] Code Region - Z msb.d
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total uOps:        800
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.40
+# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   msb	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    . .   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D==============================eeeeeER  . .   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==================================eeeeeER   msb	z0.d, p0/m, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     16.0   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     21.0   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     25.5   0.0    0.0       msb	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     18.4   0.1    0.0       <total>
+
+# CHECK:      [24] Code Region - Z fcmla ZPmZZ
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+
+# CHECK:      [25] Code Region - Z fcmla ZZZI
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+
+# CHECK:      [26] Code Region - Z fmla ZPmZZ
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [27] Code Region - Z fmla ZZZI
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, z0.d, z1.d[1]
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [28] Code Region - Z bfdot
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+
+# CHECK:      [29] Code Region - Z bfmmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+
+# CHECK:      [30] Code Region - bfmlalb
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    15
+# CHECK-NEXT: uOps Per Cycle:    0.22
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmlalb	z0.s, z0.h, z1.h
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>

>From 1e10154ae3b844a19355d2228a00bac351075bb5 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Mon, 7 Oct 2024 14:55:39 +0000
Subject: [PATCH 2/3] [AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling
 model.

---
 .../Target/AArch64/AArch64SchedNeoverseV1.td  |  205 ++-
 .../llvm-mca/AArch64/Neoverse/V1-forwarding.s | 1136 ++++++++---------
 .../AArch64/Neoverse/V1-neon-instructions.s   |  138 +-
 3 files changed, 790 insertions(+), 689 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index 8abbfe63d9a806..1d7cb699f731aa 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -469,6 +469,87 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
                                             V1UnitV, V1UnitV, V1UnitV,
                                             V1UnitV, V1UnitV, V1UnitV]>;
 
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for
+// consumers of 64 bit multiply high operations?
+def V1WriteIM : SchedWriteVariant<
+                  [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
+                   SchedVar<NoSchedPred,          [V1Write_2c_1M0]>]>;
+def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;
+
+def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;
+
+def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
+def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>;
+
+def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>;
+
+def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>;
+
+def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>;
+
+def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; }
+def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>;
+
+def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; }
+def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>;
+
+def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>;
+
+def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>;
+
+def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
+def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>;
+
+def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>;
+
+def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; }
+def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>;
+
+def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
+def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>;
+
+def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
+def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>;
+
+def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
+def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>;
+
+def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; }
+def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>;
+
+def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; }
+def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>;
+
+def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; }
+def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>;
+
+let Latency = 5, NumMicroOps = 2 in
+def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>;
+def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>;
+
+def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>;
+
+def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
+def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>;
+
+def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; }
+def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>;
+def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>;
+def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; }
+def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>;
 
 // Miscellaneous Instructions
 // -----------------------------------------------------------------------------
@@ -553,16 +634,22 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
 def : SchedAlias<WriteID32, V1Write_12c5_1M0>;
 def : SchedAlias<WriteID64, V1Write_20c5_1M0>;
 
+def           : SchedAlias<WriteIM32, V1Write_2c_1M>;
+def           : SchedAlias<WriteIM64, V1Write_2c_1M>;
+
 // Multiply
-// Multiply accumulate
-// Multiply accumulate, long
-// Multiply long
-def V1WriteIM : SchedWriteVariant<
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+/*def V1WriteIM : SchedWriteVariant<
                   [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
                    SchedVar<NoSchedPred,          [V1Write_2c_1M0]>]>;
-def           : SchedAlias<WriteIM32, V1WriteIM>;
-def           : SchedAlias<WriteIM64, V1WriteIM>;
+def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;*/
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA], (instregex "^M(ADD|SUB)[WX]rrr$")>;
 
+// Multiply accumulate long
+// Multiply long
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA],
+             (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
 // Multiply high
 def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
 
@@ -680,10 +767,10 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>;
 def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>;
 
 // FP multiply
-def : SchedAlias<WriteFMul, V1Write_3c_1V>;
+def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; }
 
 // FP multiply accumulate
-def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
 
 // FP round to integral
 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -824,7 +911,7 @@ def : SchedAlias<WriteVq, V1Write_2c_1V>;
 // ASIMD absolute diff accum
 // ASIMD absolute diff accum long
 // ASIMD pairwise add and accumulate long
-def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
+def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>;
 
 // ASIMD arith, reduce, 4H/4S
 // ASIMD max/min, reduce, 4H/4S
@@ -843,23 +930,25 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
-def : InstRW<[V1Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
+def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
 
-// ASIMD matrix multiply- accumulate
-def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+// ASIMD matrix multiply-accumulate
+def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
 
 // ASIMD multiply
+def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>;
+
 // ASIMD multiply accumulate
+def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>;
+
 // ASIMD multiply accumulate long
+def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+
 // ASIMD multiply accumulate high
+def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+
 // ASIMD multiply accumulate saturating long
-def : InstRW<[V1Write_4c_1V02], 
-             (instregex "^MUL(v[148]i16|v[124]i32)$",
-                        "^SQR?DMULH(v[48]i16|v[24]i32)$",
-                        "^ML[AS](v[148]i16|v[124]i32)$",
-                        "^[SU]ML[AS]Lv",
-                        "^SQRDML[AS]H(v[148]i16|v[124]i32)$",
-                        "^SQDML[AS]Lv")>;
+def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>;
 
 // ASIMD multiply/multiply long (8x8) polynomial
 def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
@@ -868,11 +957,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>;
 def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>;
 
 // ASIMD shift accumulate
+def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>;
+
 // ASIMD shift by immed, complex
 // ASIMD shift by register, complex
 def : InstRW<[V1Write_4c_1V13],
-             (instregex "^[SU]R?SRAv",
-                        "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
+             (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$",
                         "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
                         "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv", 
                         "^[SU]Q?RSHLv", "^[SU]QSHLv")>;
@@ -890,18 +980,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv",
 // ASIMD FP absolute value/difference
 // ASIMD FP arith, normal
 // ASIMD FP compare
-// ASIMD FP complex add
 // ASIMD FP max/min, normal
 // ASIMD FP max/min, pairwise
 // ASIMD FP negate
 // Covered by "SchedAlias (WriteV[dq]...)" above
 
-// ASIMD FP multiply accumulate
-def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$",
-                                         "^FML[AS]v")>;
+// ASIMD FP complex add
+def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>;
 
 // ASIMD FP complex multiply add
-def : InstRW<[V1Write_4c_1V], (instregex "^FCMLAv")>;
+def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>;
+
+// ASIMD FP multiply
+def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>;
+
+// ASIMD FP multiply accumulate
+def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>;
+
+// ASIMD FP multiply accumulate long
+def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>;
 
 // ASIMD FP convert, long (F16 to F32)
 def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>;
@@ -955,12 +1052,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
 // ASIMD FP max/min, reduce, Q-form F16
 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
 
-// ASIMD FP multiply
-def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>;
-
-// ASIMD FP multiply accumulate long
-def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>;
-
 // ASIMD FP round, D-form F32 and Q-form F64
 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
 
@@ -978,13 +1069,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>;
 def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>;
 
 // ASIMD dot product
-def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
+def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>;
 
 // ASIMD matrix multiply accumulate
-def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>;
+def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>;
 
 // ASIMD multiply accumulate long
-def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>;
+def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>;
 
 // Scalar convert, F32 to BF16
 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>;
@@ -1302,7 +1393,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>;
 // -----------------------------------------------------------------------------
 
 // CRC checksum ops
-def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>;
+def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
 
 
 // SVE Predicate instructions
@@ -1442,13 +1533,13 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
                                              "^[SU]DIV_ZPZZ_D")>;
 
 // Dot product, 8 bit
-def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>;
+def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;
 
 // Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
+def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
 
 // Dot product, 16 bit
-def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>;
+def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;
 
 // Duplicate, immediate and indexed form
 def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$",
@@ -1490,7 +1581,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
                                            "^MOVPRFX_ZZ$")>;
 
 // Matrix multiply-accumulate
-def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
 
 // Multiply, B, H, S element size
 def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
@@ -1499,12 +1590,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]",
                                           "^[SU]MULH_ZPZZ_[BHS]")>;
 
 // Multiply, D element size
-// Multiply accumulate, D element size
 def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D",
                                           "^MUL_ZPZZ_D",
                                           "^[SU]MULH_(ZPmZ|ZZZ)_D",
-                                          "^[SU]MULH_ZPZZ_D",
-                                          "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
+                                          "^[SU]MULH_ZPZZ_D")>;
+
+// Multiply accumulate, D element size
+def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD],
+             (instregex "^ML[AS]_ZPZZZ_D")>;
+def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD],
+             (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>;
 
 // Multiply accumulate, B, H, S element size
 // NOTE: This is not specified in the SOG.
@@ -1585,8 +1680,10 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
 def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;
 
 // Floating point complex multiply add
-def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
-                                           "^FCMLA_ZZZI_[HS]$")>;
+/*def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
+                                           "^FCMLA_ZZZI_[HS]$")>;*/
+def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA],              (instregex "^FCMLA_ZZZI_[HS]")>;
 
 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
 // Floating point convert to integer, F32
@@ -1625,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
                                            "^FMUL_ZPZ[IZ]_[HSD]")>;
 
 // Floating point multiply accumulate
+def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA],
+             (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+                        "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA],
+             (instregex "^FML[AS]_ZZZI_[HSD]",
+                        "^FN?ML[AS]_ZPZZZ_[HSD]")>;
+
 // Floating point reciprocal step
-def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
-                                           "^FN?ML[AS]_ZPZZZ_[HSD]",
-                                           "^FML[AS]_ZZZI_[HSD]$",
-                                           "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>;
+def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>;
 
 // Floating point reciprocal estimate, F16
 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>;
@@ -1683,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$",
 def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
 
 // Dot product
-def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
 
 // Matrix multiply accumulate
-def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>;
+def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>;
 
 // Multiply accumulate long
-def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
+def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
 
 
 // SVE Load instructions
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
index 694c931c029412..4de37f96000520 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s
@@ -228,26 +228,26 @@ bfmlalb z0.s, z0.h, z1.h
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .    .  .   mul	x0, x0, x0
-# CHECK-NEXT: [0,1]     D==eeER   .    .  .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,2]     D====eeER .    .  .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,3]     D======eeER    .  .   madd	x0, x0, x0, x0
-# CHECK-NEXT: [1,0]     D========eeER  .  .   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     D==========eeER.  .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,2]     D============eeER .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,3]     D==============eeER   madd	x0, x0, x0, x0
+# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeER   .    ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D===eeER  .    ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -256,36 +256,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     7.0    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     2     9.0    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 3.     2     11.0   0.0    0.0       madd	x0, x0, x0, x0
-# CHECK-NEXT:        2     8.0    0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
 
 # CHECK:      [1] Code Region - smaddl
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .    .  .   mul	x0, x0, x0
-# CHECK-NEXT: [0,1]     D==eeER   .    .  .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [0,2]     D====eeER .    .  .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [0,3]     D======eeER    .  .   smaddl	x0, w0, w0, x0
-# CHECK-NEXT: [1,0]     D========eeER  .  .   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     D==========eeER.  .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,2]     D============eeER .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,3]     D==============eeER   smaddl	x0, w0, w0, x0
+# CHECK:      [0,0]     DeeER.    .    ..   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeER   .    ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,2]     D===eeER  .    ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,3]     D=====eeER.    ..   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     D=========eeER ..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,2]     D==========eeER..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     D============eeER   smaddl	x0, w0, w0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -294,40 +294,40 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     7.0    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 2.     2     9.0    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 3.     2     11.0   0.0    0.0       smaddl	x0, w0, w0, x0
-# CHECK-NEXT:        2     8.0    0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     6.5    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 2.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
 
 # CHECK:      [2] Code Region - fmadd
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   .   fadd	d0, d0, d0
-# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    .    .   .   fmul	d0, d0, d0
-# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmadd	d0, d0, d1, d2
-# CHECK-NEXT: [1,0]     D=====================eeER    .    .    .   .   fadd	d0, d0, d0
-# CHECK-NEXT: [1,1]     D=======================eeeeER.    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,2]     D===========================eeeER  .    .   .   fmul	d0, d0, d0
-# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,4]     D==================================eeeeER   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,5]     D======================================eeeeER   fmadd	d0, d0, d1, d2
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    ..   fmul	d0, d0, d0
+# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     D=================eeER   .    .    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     D===================eeeeER    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     D=======================eeeER .    ..   fmul	d0, d0, d0
+# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     D==============================eeeeER   fmadd	d0, d0, d1, d2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -336,38 +336,38 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fadd	d0, d0, d0
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fmul	d0, d0, d0
-# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 4.     2     24.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 5.     2     28.5   0.0    0.0       fmadd	d0, d0, d1, d2
-# CHECK-NEXT:        2     19.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     15.7   0.1    0.0       <total>
 
 # CHECK:      [3] Code Region - saba
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   saba	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D============================eeeeER   saba	v0.4s, v0.4s, v1.4s
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeeER .  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     D======================eeeeER   saba	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -376,36 +376,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [4] Code Region - sadalp
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   sadalp	v0.2d, v0.4s
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [1,3]     D============================eeeeER   sadalp	v0.2d, v0.4s
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   sadalp	v0.2d, v0.4s
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeeER .  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,3]     D======================eeeeER   sadalp	v0.2d, v0.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -414,36 +414,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sadalp	v0.2d, v1.4s
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       sadalp	v0.2d, v1.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sadalp	v0.2d, v0.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       sadalp	v0.2d, v0.4s
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [5] Code Region - sdot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 0.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   sdot	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D=================eeeER  .  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2]     D====================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3]     D=======================eeeER   sdot	v0.4s, v0.16b, v1.16b
+# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D========eeeER .    .   .   sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D===============eeeER   .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     D================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     D===================eeeER   sdot	v0.4s, v0.16b, v1.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -452,36 +452,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     14.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     10.8   0.1    0.0       <total>
 
 # CHECK:      [6] Code Region - smmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1303
+# CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 0.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeER.    .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,2]     D=======eeeER  .    .    .  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [0,3]     D==========eeeER    .    .  .   smmla	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D=================eeeER  .  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2]     D====================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3]     D=======================eeeER   smmla	v0.4s, v0.16b, v1.16b
+# CHECK:      [0,0]     DeeeeER   .    .    .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeER.    .    .   .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,2]     D=====eeeER    .    .   .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D===============eeeER   .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,2]     D================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     D===================eeeER   smmla	v0.4s, v0.16b, v1.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -490,36 +490,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 2.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     14.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     10.8   0.1    0.0       <total>
 
 # CHECK:      [7] Code Region - mla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D============================eeeeER   mla	v0.4s, v0.4s, v1.4s
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeeER .  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     D======================eeeeER   mla	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -528,36 +528,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [8] Code Region - smlal2
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D============================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeeER .  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D======================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -566,36 +566,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [9] Code Region - ssra
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1603
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .    .   .   ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: [0,2]     D========eeeeER.    .    .    .   .   ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: [0,3]     D============eeeeER .    .    .   .   ssra	v0.2d, v0.2d, #1
-# CHECK-NEXT: [1,0]     D================eeeeER  .    .   .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D====================eeeeER   .   .   ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,2]     D========================eeeeER   .   ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,3]     D============================eeeeER   ssra	v0.2d, v0.2d, #1
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D====eeeeER    .    .    .  .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D=================eeeeER .  .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3]     D======================eeeeER   ssra	v0.2d, v0.2d, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -604,36 +604,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: 2.     2     17.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       ssra	v0.2d, v0.2d, #1
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [10] Code Region - fcmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fcmla	v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3]     D======================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -642,40 +642,40 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [11] Code Region - fmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,4]     D=============eeeeER.    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,5]     D=================eeeeER .    .    .    .   .   fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0]     D=====================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D========================eeeeER    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2]     D============================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     D==============================eeeeER   .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4]     D==================================eeeeER   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5]     D======================================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D=eeeeER  .    .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2]     D=====eeER.    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     D=================eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D==================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     D======================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     D==============================eeeeER   fmla	v0.2d, v0.2d, v1.2d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -684,42 +684,42 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     20.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4.     2     24.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5.     2     28.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT:        2     19.7   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     15.3   0.1    0.0       <total>
 
 # CHECK:      [12] Code Region - fmlal
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2203
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.24
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,2]     D========eeER  .    .    .    .    .    .    .    . .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    .    .    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,4]     D===============eeeeeER  .    .    .    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,5]     D====================eeeeeER  .    .    .    .    . .   fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0]     D=========================eeeER    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D============================eeeeeER    .    .    . .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2]     D=================================eeER  .    .    . .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     D===================================eeeeeER  .    . .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,4]     D========================================eeeeeER  . .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5]     D=============================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2]     D========eeER  .    .    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,4]     D============eeeeeER.    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5]     D=================eeeeeER.    .    .    .    ..   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     D======================eeeER  .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     D==============================eeER.    .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     D================================eeeeeER.    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4]     D==================================eeeeeER   ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     D=======================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -728,38 +728,38 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     13.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     16.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2.     2     21.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     23.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 4.     2     28.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5.     2     33.5   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT:        2     22.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     12.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     15.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     20.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     22.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4.     2     24.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     29.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     20.3   0.1    0.0       <total>
 
 # CHECK:      [13] Code Region - bfdot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D================eeeeER  .  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D======================eeeeER   bfdot	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -768,36 +768,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [14] Code Region - bfmmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -806,36 +806,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
 
 # CHECK:      [15] Code Region - bfmlalb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfmlalb	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     D================eeeeER  .  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     D======================eeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -844,36 +844,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [16] Code Region - crc32cb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .    .  .   mul	w0, w0, w0
-# CHECK-NEXT: [0,1]     D==eeER   .    .  .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [0,2]     D====eeER .    .  .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [0,3]     D======eeER    .  .   crc32cb	w0, w0, w0
-# CHECK-NEXT: [1,0]     D========eeER  .  .   mul	w0, w0, w0
-# CHECK-NEXT: [1,1]     D==========eeER.  .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [1,2]     D============eeER .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [1,3]     D==============eeER   crc32cb	w0, w0, w0
+# CHECK:      [0,0]     DeeER.    .    ..   mul	w0, w0, w0
+# CHECK-NEXT: [0,1]     D==eeER   .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,2]     D===eeER  .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,3]     D=====eeER.    ..   crc32cb	w0, w0, w0
+# CHECK-NEXT: [1,0]     D=======eeER   ..   mul	w0, w0, w0
+# CHECK-NEXT: [1,1]     D=========eeER ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,2]     D==========eeER..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,3]     D============eeER   crc32cb	w0, w0, w0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -882,36 +882,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	w0, w0, w0
-# CHECK-NEXT: 1.     2     7.0    0.0    0.0       crc32cb	w0, w0, w1
-# CHECK-NEXT: 2.     2     9.0    0.0    0.0       crc32cb	w0, w0, w1
-# CHECK-NEXT: 3.     2     11.0   0.0    0.0       crc32cb	w0, w0, w0
-# CHECK-NEXT:        2     8.0    0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	w0, w0, w0
+# CHECK-NEXT: 1.     2     6.5    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 2.     2     7.5    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       crc32cb	w0, w0, w0
+# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
 
 # CHECK:      [17] Code Region - Z sdot.s
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -920,36 +920,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [18] Code Region - Z sudot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   sdot	z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeER.    .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,2]     D======================eeeER  .   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,3]     D=========================eeeER   sdot	z0.s, z0.b, z1.b[1]
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b[1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -958,36 +958,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [19] Code Region - Z sdot.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D=========eeeeER    .    .    .    ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D=============eeeeER.    .    .    ..   sdot	z0.d, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==========================eeeeER  ..   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D==============================eeeeER   sdot	z0.d, z0.h, z1.h
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeeER    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D====================eeeeER   .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D========================eeeeER   sdot	z0.d, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -996,36 +996,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: 2.     2     18.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sdot	z0.d, z0.h, z1.h
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
 
 # CHECK:      [20] Code Region - Z smmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeER    .    .    .    .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,2]     D========eeeER .    .    .    .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [0,3]     D===========eeeER   .    .    .   smmla	z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeER.    .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D======================eeeER  .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D=========================eeeER   smmla	z0.s, z0.b, z1.b
+# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D=================eeeER  ..   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     D==================eeeER ..   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     D=====================eeeER   smmla	z0.s, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1034,36 +1034,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: 3.     2     19.0   0.0    0.0       smmla	z0.s, z0.b, z1.b
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [21] Code Region - Z mla.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.40
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   mla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==============================eeeeeER  . .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     .D==================================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .    ..   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D============eeeeeER.    .    .    ..   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeeER.    ..   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D========================eeeeeER   ..   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D============================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1072,36 +1072,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     16.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     21.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     25.5   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     18.4   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     16.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     15.4   0.1    0.0       <total>
 
 # CHECK:      [22] Code Region - Z mad.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.40
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   mad	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   mad	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   mad	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    . .   mad	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==============================eeeeeER  . .   mad	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     .D==================================eeeeeER   mad	z0.d, p0/m, z0.d, z1.d
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .    ..   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D============eeeeeER.    .    .    ..   mad	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeeER.    ..   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D========================eeeeeER   ..   mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D============================eeeeeER   mad	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1110,36 +1110,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     16.0   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     21.0   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     25.5   0.0    0.0       mad	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     18.4   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     16.5   0.0    0.0       mad	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       mad	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     15.4   0.1    0.0       <total>
 
 # CHECK:      [23] Code Region - Z msb.d
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.40
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .    . .   msb	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D==========eeeeeER  .    .    .    .    . .   msb	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===============eeeeeER  .    .    .    . .   msb	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D====================eeeeeER  .    .    . .   mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=========================eeeeeER  .    . .   msb	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==============================eeeeeER  . .   msb	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     .D==================================eeeeeER   msb	z0.d, p0/m, z0.d, z1.d
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .    ..   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D============eeeeeER.    .    .    ..   msb	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D=================eeeeeER.    .    ..   mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D======================eeeeeER.    ..   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D========================eeeeeER   ..   msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D============================eeeeeER   msb	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1148,36 +1148,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.0   0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
-# CHECK-NEXT: 1.     2     16.0   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     21.0   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     25.5   0.0    0.0       msb	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     18.4   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, p0/m, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     16.5   0.0    0.0       msb	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       msb	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     15.4   0.1    0.0       <total>
 
 # CHECK:      [24] Code Region - Z fcmla ZPmZZ
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          012
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1186,36 +1186,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
 
 # CHECK:      [25] Code Region - Z fcmla ZZZI
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          012
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   fcmla	z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1224,36 +1224,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
 
 # CHECK:      [26] Code Region - Z fmla ZPmZZ
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1262,36 +1262,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [27] Code Region - Z fmla ZZZI
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   fmla	z0.d, z0.d, z1.d[1]
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,3]     D==========================eeeeER   fmla	z0.d, z0.d, z1.d[1]
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, z0.d, z1.d[1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1300,36 +1300,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [28] Code Region - Z bfdot
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1503
+# CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.31
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   bfdot	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D==========================eeeeER   bfdot	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=====eeeeER   .    .    .  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D================eeeeER  .  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D==================eeeeER.  .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D======================eeeeER   bfdot	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1338,36 +1338,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [29] Code Region - Z bfmmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.25
+# CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmmla	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmmla	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .   .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfmmla	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1376,36 +1376,36 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
 
 # CHECK:      [30] Code Region - bfmlalb
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    15
-# CHECK-NEXT: uOps Per Cycle:    0.22
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          012
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .  .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   bfmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D==================eeeER .    .    .  .   fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=====================eeeeeER .    .  .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   bfmlalb	z0.s, z0.h, z1.h
+# CHECK:      [0,0]     DeeeER    .    .    .    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     D==================eeeeeER    . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     D=========================eeeeeER   bfmlalb	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1414,8 +1414,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     18.5   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s
index 1e8df4770d7950..65b73177c7b70a 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s
@@ -1365,8 +1365,8 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        fcmgt	s10, s11, s12
 # CHECK-NEXT:  1      2     0.25                        fcmgt	v0.4s, v0.4s, #0.0
 # CHECK-NEXT:  1      2     0.25                        fcmgt	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT:  1      2     0.25                        fcmla	v0.2s, v0.2s, v0.2s, #90
-# CHECK-NEXT:  1      2     0.25                        fcmla	v0.4s, v0.4s, v0.s[1], #0
+# CHECK-NEXT:  1      4     0.25                        fcmla	v0.2s, v0.2s, v0.2s, #90
+# CHECK-NEXT:  1      4     0.25                        fcmla	v0.4s, v0.4s, v0.s[1], #0
 # CHECK-NEXT:  1      2     0.25                        fcmle	d20, d21, #0.0
 # CHECK-NEXT:  1      2     0.25                        fcmle	s10, s11, #0.0
 # CHECK-NEXT:  1      2     0.25                        fcmle	v0.2d, v0.2d, #0.0
@@ -1651,7 +1651,7 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  7      8     1.00    *                   ld4r	{ v0.2d, v1.2d, v2.2d, v3.2d }, [sp]
 # CHECK-NEXT:  8      8     1.00    *                   ld4r	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16
 # CHECK-NEXT:  8      8     1.00    *                   ld4r	{ v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8
-# CHECK-NEXT:  1      2     0.25                        mla	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     0.50                        mla	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  1      4     0.50                        mls	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  1      2     0.25                        mov	b0, v0.b[15]
 # CHECK-NEXT:  1      2     0.25                        mov	d6, v0.d[1]
@@ -1673,7 +1673,7 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        movi	v0.2s, #8, msl #8
 # CHECK-NEXT:  1      2     0.25                        movi	v0.4s, #255, lsl #24
 # CHECK-NEXT:  1      2     0.25                        movi	v0.8b, #255
-# CHECK-NEXT:  1      2     0.25                        mul	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  1      4     0.50                        mul	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  1      2     0.25                        mvni	v0.2s, #0
 # CHECK-NEXT:  1      2     0.25                        mvni	v0.4s, #16, msl #16
 # CHECK-NEXT:  1      2     0.25                        neg	d29, d24
@@ -1780,10 +1780,10 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  2      4     1.00                        scvtf	v0.4s, v0.4s
 # CHECK-NEXT:  1      2     0.25                        scvtf	v0.4s, v0.4s, #3
 # CHECK-NEXT:  4      6     1.00                        scvtf	v0.8h, v0.8h
-# CHECK-NEXT:  1      2     0.25                        sdot	v0.2s, v0.8b, v0.4b[2]
-# CHECK-NEXT:  1      2     0.25                        sdot	v0.2s, v0.8b, v0.8b
-# CHECK-NEXT:  1      2     0.25                        sdot	v0.4s, v0.16b, v0.16b
-# CHECK-NEXT:  1      2     0.25                        sdot	v0.4s, v0.16b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        sdot	v0.2s, v0.8b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        sdot	v0.2s, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     0.25                        sdot	v0.4s, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     0.25                        sdot	v0.4s, v0.16b, v0.4b[2]
 # CHECK-NEXT:  1      2     0.25                        shadd	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  1      2     0.25                        shl	d7, d10, #12
 # CHECK-NEXT:  1      2     0.50                        shl	v0.16b, v0.16b, #3
@@ -1873,26 +1873,26 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        sqadd	b20, b11, b15
 # CHECK-NEXT:  1      2     0.25                        sqadd	v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  1      2     0.25                        sqadd	v0.2s, v0.2s, v0.2s
-# CHECK-NEXT:  1      2     0.25                        sqdmlal	d19, s24, s12
+# CHECK-NEXT:  1      4     0.50                        sqdmlal	d19, s24, s12
 # CHECK-NEXT:  1      4     0.50                        sqdmlal	d8, s9, v0.s[1]
 # CHECK-NEXT:  1      4     0.50                        sqdmlal	s0, h0, v0.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqdmlal	s17, h27, h12
+# CHECK-NEXT:  1      4     0.50                        sqdmlal	s17, h27, h12
 # CHECK-NEXT:  1      4     0.50                        sqdmlal	v0.2d, v0.2s, v0.2s
 # CHECK-NEXT:  1      4     0.50                        sqdmlal	v0.4s, v0.4h, v0.4h
 # CHECK-NEXT:  1      4     0.50                        sqdmlal2	v0.2d, v0.4s, v0.4s
 # CHECK-NEXT:  1      4     0.50                        sqdmlal2	v0.4s, v0.8h, v0.8h
-# CHECK-NEXT:  1      2     0.25                        sqdmlsl	d12, s23, s13
+# CHECK-NEXT:  1      4     0.50                        sqdmlsl	d12, s23, s13
 # CHECK-NEXT:  1      4     0.50                        sqdmlsl	d8, s9, v0.s[1]
 # CHECK-NEXT:  1      4     0.50                        sqdmlsl	s0, h0, v0.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqdmlsl	s14, h12, h25
+# CHECK-NEXT:  1      4     0.50                        sqdmlsl	s14, h12, h25
 # CHECK-NEXT:  1      4     0.50                        sqdmlsl	v0.2d, v0.2s, v0.2s
 # CHECK-NEXT:  1      4     0.50                        sqdmlsl	v0.4s, v0.4h, v0.4h
 # CHECK-NEXT:  1      4     0.50                        sqdmlsl2	v0.2d, v0.4s, v0.4s
 # CHECK-NEXT:  1      4     0.50                        sqdmlsl2	v0.4s, v0.8h, v0.8h
-# CHECK-NEXT:  1      2     0.25                        sqdmulh	h10, h11, h12
-# CHECK-NEXT:  1      2     0.25                        sqdmulh	h7, h15, v0.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqdmulh	s15, s14, v0.s[1]
-# CHECK-NEXT:  1      2     0.25                        sqdmulh	s20, s21, s2
+# CHECK-NEXT:  1      4     0.50                        sqdmulh	h10, h11, h12
+# CHECK-NEXT:  1      4     0.50                        sqdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqdmulh	s20, s21, s2
 # CHECK-NEXT:  1      4     0.50                        sqdmulh	v0.2s, v0.2s, v0.2s
 # CHECK-NEXT:  1      4     0.50                        sqdmulh	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT:  1      3     0.50                        sqdmull	d1, s1, v0.s[1]
@@ -1914,34 +1914,34 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        sqneg	v0.4s, v0.4s
 # CHECK-NEXT:  1      2     0.25                        sqneg	v0.8b, v0.8b
 # CHECK-NEXT:  1      2     0.25                        sqneg	v0.8h, v0.8h
-# CHECK-NEXT:  1      2     0.25                        sqrdmlah	h0, h1, v2.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlah	v0.4h, v1.4h, v2.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlah	v0.8h, v1.8h, v2.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlah	s0, s1, v2.s[1]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlah	v0.2s, v1.2s, v2.s[1]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlah	v0.4s, v1.4s, v2.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlah	h0, h1, v2.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.4h, v1.4h, v2.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.8h, v1.8h, v2.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlah	s0, s1, v2.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.2s, v1.2s, v2.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  1      4     0.50                        sqrdmlah	h0, h1, h2
 # CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.4h, v1.4h, v2.4h
 # CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  1      4     0.50                        sqrdmlah	s0, s1, s2
 # CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.2s, v1.2s, v2.2s
 # CHECK-NEXT:  1      4     0.50                        sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT:  1      2     0.25                        sqrdmlsh	h0, h1, v2.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlsh	v0.4h, v1.4h, v2.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlsh	v0.8h, v1.8h, v2.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlsh	s0, s1, v2.s[1]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlsh	v0.2s, v1.2s, v2.s[1]
-# CHECK-NEXT:  1      2     0.25                        sqrdmlsh	v0.4s, v1.4s, v2.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlsh	h0, h1, v2.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.4h, v1.4h, v2.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.8h, v1.8h, v2.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlsh	s0, s1, v2.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.2s, v1.2s, v2.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  1      4     0.50                        sqrdmlsh	h0, h1, h2
 # CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.4h, v1.4h, v2.4h
 # CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  1      4     0.50                        sqrdmlsh	s0, s1, s2
 # CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.2s, v1.2s, v2.2s
 # CHECK-NEXT:  1      4     0.50                        sqrdmlsh	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT:  1      2     0.25                        sqrdmulh	h10, h11, h12
-# CHECK-NEXT:  1      2     0.25                        sqrdmulh	h7, h15, v0.h[3]
-# CHECK-NEXT:  1      2     0.25                        sqrdmulh	s15, s14, v0.s[1]
-# CHECK-NEXT:  1      2     0.25                        sqrdmulh	s20, s21, s2
+# CHECK-NEXT:  1      4     0.50                        sqrdmulh	h10, h11, h12
+# CHECK-NEXT:  1      4     0.50                        sqrdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  1      4     0.50                        sqrdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  1      4     0.50                        sqrdmulh	s20, s21, s2
 # CHECK-NEXT:  1      4     0.50                        sqrdmulh	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  1      4     0.50                        sqrdmulh	v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      4     0.50                        sqrshl	d31, d31, d31
@@ -2124,8 +2124,8 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  5      4     1.00           *            st4	{ v0.d, v1.d, v2.d, v3.d }[1], [x0], x5
 # CHECK-NEXT:  1      2     0.25                        sub	d15, d5, d16
 # CHECK-NEXT:  1      2     0.25                        sub	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT:  1      2     0.25                        sudot	v0.2s, v0.8b, v0.4b[2]
-# CHECK-NEXT:  1      2     0.25                        sudot	v0.4s, v0.16b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        sudot	v0.2s, v0.8b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        sudot	v0.4s, v0.16b, v0.4b[2]
 # CHECK-NEXT:  1      2     0.25                        suqadd	b19, b14
 # CHECK-NEXT:  1      2     0.25                        suqadd	d18, d22
 # CHECK-NEXT:  1      2     0.25                        suqadd	h20, h15
@@ -2222,10 +2222,10 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  2      4     1.00                        ucvtf	v0.4s, v0.4s
 # CHECK-NEXT:  1      2     0.25                        ucvtf	v0.4s, v0.4s, #3
 # CHECK-NEXT:  4      6     1.00                        ucvtf	v0.8h, v0.8h
-# CHECK-NEXT:  1      2     0.25                        udot	v0.2s, v0.8b, v0.4b[2]
-# CHECK-NEXT:  1      2     0.25                        udot	v0.2s, v0.8b, v0.8b
-# CHECK-NEXT:  1      2     0.25                        udot	v0.4s, v0.16b, v0.16b
-# CHECK-NEXT:  1      2     0.25                        udot	v0.4s, v0.16b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        udot	v0.2s, v0.8b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        udot	v0.2s, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     0.25                        udot	v0.4s, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     0.25                        udot	v0.4s, v0.16b, v0.4b[2]
 # CHECK-NEXT:  1      2     0.25                        uhadd	v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  1      2     0.25                        uhadd	v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        uhsub	v0.4s, v0.4s, v0.4s
@@ -2356,10 +2356,10 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      4     0.50                        ursra	v0.4s, v0.4s, #3
 # CHECK-NEXT:  1      4     0.50                        ursra	v0.8b, v0.8b, #3
 # CHECK-NEXT:  1      4     0.50                        ursra	v0.8h, v0.8h, #3
-# CHECK-NEXT:  1      2     0.25                        usdot	v0.2s, v0.8b, v0.4b[2]
-# CHECK-NEXT:  1      2     0.25                        usdot	v0.2s, v0.8b, v0.8b
-# CHECK-NEXT:  1      2     0.25                        usdot	v0.4s, v0.16b, v0.16b
-# CHECK-NEXT:  1      2     0.25                        usdot	v0.4s, v0.16b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        usdot	v0.2s, v0.8b, v0.4b[2]
+# CHECK-NEXT:  1      3     0.25                        usdot	v0.2s, v0.8b, v0.8b
+# CHECK-NEXT:  1      3     0.25                        usdot	v0.4s, v0.16b, v0.16b
+# CHECK-NEXT:  1      3     0.25                        usdot	v0.4s, v0.16b, v0.4b[2]
 # CHECK-NEXT:  1      2     0.50                        ushl	d0, d0, d0
 # CHECK-NEXT:  1      2     0.50                        ushl	v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  1      2     0.50                        ushl	v0.4s, v0.4s, v0.4s
@@ -2465,7 +2465,7 @@ zip2 v0.8h, v0.8h, v0.8h
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]
-# CHECK-NEXT:  -      -      -      -     26.67  49.17  49.17  18.75  7.75   7.75   7.75   394.50 377.00 349.00 331.50
+# CHECK-NEXT:  -      -      -      -     26.67  49.17  49.17  18.75  7.75   7.75   7.75   401.00 370.50 355.50 325.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0.0]  [0.1]  [1.0]  [1.1]  [2]    [3.0]  [3.1]  [4]    [5]    [6.0]  [6.1]  [7]    [8]    [9]    [10]   Instructions:
@@ -2892,7 +2892,7 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00    -      -      -      -     1.00   1.00   1.00   1.00   ld4r	{ v0.2d, v1.2d, v2.2d, v3.2d }, [sp]
 # CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00   0.25   0.25   0.25   0.25   1.00   1.00   1.00   1.00   ld4r	{ v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16
 # CHECK-NEXT:  -      -      -      -     1.00   1.00   1.00   0.25   0.25   0.25   0.25   1.00   1.00   1.00   1.00   ld4r	{ v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   mla	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     mla	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     mls	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   mov	b0, v0.b[15]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   mov	d6, v0.d[1]
@@ -2914,7 +2914,7 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   movi	v0.2s, #8, msl #8
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   movi	v0.4s, #255, lsl #24
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   movi	v0.8b, #255
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   mul	v0.8b, v0.8b, v0.8b
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     mul	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   mvni	v0.2s, #0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   mvni	v0.4s, #16, msl #16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   neg	d29, d24
@@ -3114,26 +3114,26 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqadd	b20, b11, b15
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqadd	v0.16b, v0.16b, v0.16b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqadd	v0.2s, v0.2s, v0.2s
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmlal	d19, s24, s12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal	d19, s24, s12
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal	d8, s9, v0.s[1]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal	s0, h0, v0.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmlal	s17, h27, h12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal	s17, h27, h12
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal	v0.2d, v0.2s, v0.2s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal	v0.4s, v0.4h, v0.4h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal2	v0.2d, v0.4s, v0.4s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlal2	v0.4s, v0.8h, v0.8h
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmlsl	d12, s23, s13
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl	d12, s23, s13
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl	d8, s9, v0.s[1]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl	s0, h0, v0.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmlsl	s14, h12, h25
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl	s14, h12, h25
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl	v0.2d, v0.2s, v0.2s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl	v0.4s, v0.4h, v0.4h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl2	v0.2d, v0.4s, v0.4s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmlsl2	v0.4s, v0.8h, v0.8h
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmulh	h10, h11, h12
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmulh	h7, h15, v0.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmulh	s15, s14, v0.s[1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqdmulh	s20, s21, s2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmulh	h10, h11, h12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmulh	s20, s21, s2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmulh	v0.2s, v0.2s, v0.2s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmulh	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqdmull	d1, s1, v0.s[1]
@@ -3155,34 +3155,34 @@ zip2 v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqneg	v0.4s, v0.4s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqneg	v0.8b, v0.8b
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqneg	v0.8h, v0.8h
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlah	h0, h1, v2.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlah	v0.4h, v1.4h, v2.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlah	v0.8h, v1.8h, v2.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlah	s0, s1, v2.s[1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlah	v0.2s, v1.2s, v2.s[1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlah	v0.4s, v1.4s, v2.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	h0, h1, v2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.4h, v1.4h, v2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.8h, v1.8h, v2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	s0, s1, v2.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.2s, v1.2s, v2.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	h0, h1, h2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.4h, v1.4h, v2.4h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	s0, s1, s2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.2s, v1.2s, v2.2s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlsh	h0, h1, v2.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlsh	v0.4h, v1.4h, v2.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlsh	v0.8h, v1.8h, v2.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlsh	s0, s1, v2.s[1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlsh	v0.2s, v1.2s, v2.s[1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmlsh	v0.4s, v1.4s, v2.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	h0, h1, v2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.4h, v1.4h, v2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.8h, v1.8h, v2.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	s0, s1, v2.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.2s, v1.2s, v2.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	h0, h1, h2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.4h, v1.4h, v2.4h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	s0, s1, s2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.2s, v1.2s, v2.2s
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmlsh	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmulh	h10, h11, h12
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmulh	h7, h15, v0.h[3]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmulh	s15, s14, v0.s[1]
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.25   0.25   0.25   0.25   sqrdmulh	s20, s21, s2
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmulh	h10, h11, h12
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmulh	h7, h15, v0.h[3]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmulh	s15, s14, v0.s[1]
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmulh	s20, s21, s2
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmulh	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50    -     sqrdmulh	v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -     0.50    -     0.50   sqrshl	d31, d31, d31

>From f9fa6cac3a0adc9e16c3465dbb847d52e3b5a316 Mon Sep 17 00:00:00 2001
From: Rin Dobrescu <rin.dobrescu at arm.com>
Date: Thu, 10 Oct 2024 09:38:25 +0000
Subject: [PATCH 3/3] Address comments and format code.

---
 .../Target/AArch64/AArch64SchedNeoverseV1.td  | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
index 1d7cb699f731aa..fb4d2f3d7bcd3a 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td
@@ -474,10 +474,12 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01,
 
 // NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for
 // consumers of 64 bit multiply high operations?
+def V1Wr_IM   : SchedWriteRes<[V1UnitM]>  { let Latency = 2; }
+def V1Wr_IMA  : SchedWriteRes<[V1UnitM0]> { let Latency = 2; }
 def V1WriteIM : SchedWriteVariant<
-                  [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
-                   SchedVar<NoSchedPred,          [V1Write_2c_1M0]>]>;
-def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;
+                  [SchedVar<NeoverseMULIdiomPred, [V1Wr_IM]>,
+                   SchedVar<NoSchedPred,          [V1Wr_IMA]>]>;
+def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>;
 
 def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; }
 def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>;
@@ -634,21 +636,18 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>;
 def : SchedAlias<WriteID32, V1Write_12c5_1M0>;
 def : SchedAlias<WriteID64, V1Write_20c5_1M0>;
 
-def           : SchedAlias<WriteIM32, V1Write_2c_1M>;
-def           : SchedAlias<WriteIM64, V1Write_2c_1M>;
+def : SchedAlias<WriteIM32, V1Write_2c_1M>;
+def : SchedAlias<WriteIM64, V1Write_2c_1M>;
 
 // Multiply
 // Multiply accumulate, W-form
 // Multiply accumulate, X-form
-/*def V1WriteIM : SchedWriteVariant<
-                  [SchedVar<NeoverseMULIdiomPred, [V1Write_2c_1M]>,
-                   SchedVar<NoSchedPred,          [V1Write_2c_1M0]>]>;
-def V1Rd_MA : SchedReadAdvance<1, [V1Write_2c_1M0]>;*/
-def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA], (instregex "^M(ADD|SUB)[WX]rrr$")>;
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA],
+             (instregex "^M(ADD|SUB)[WX]rrr$")>;
 
 // Multiply accumulate long
 // Multiply long
-def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_MA],
+def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA],
              (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
 // Multiply high
 def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
@@ -770,7 +769,8 @@ def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>;
 def : WriteRes<WriteFMul, [V1UnitV]> { let Latency = 3; }
 
 // FP multiply accumulate
-def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA],
+             (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
 
 // FP round to integral
 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -930,7 +930,8 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$",
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
-def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
+def : InstRW<[V1Wr_VDOT, V1Rd_VDOT],
+             (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>;
 
 // ASIMD matrix multiply-accumulate
 def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
@@ -1536,7 +1537,8 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D",
 def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>;
 
 // Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
+def : InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB],
+             (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>;
 
 // Dot product, 16 bit
 def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>;
@@ -1680,8 +1682,6 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
 def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>;
 
 // Floating point complex multiply add
-/*def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$",
-                                           "^FCMLA_ZZZI_[HS]$")>;*/
 def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>;
 def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA],              (instregex "^FCMLA_ZZZI_[HS]")>;
 



More information about the llvm-commits mailing list