[llvm] [AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td (PR #142565)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 6 02:22:46 PDT 2025


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/142565

>From 42235a3d7ad0268e8b5bb5cd3970e0a0a2984b8b Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 4 Jun 2025 08:09:22 +0000
Subject: [PATCH] [AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td

I think that the issue width for neoverse-v2 CPUs is set too
high and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (16, 8 and 6) with runs
of SPEC2017 on a neoverse-v2 machine and I got the highest
overall geomean score with an issue width of 6, although it's
only a marginal 0.14% improvement. I also observed a 1-2%
improvement when testing the Gromacs application with some
workloads. Here are some notable changes in SPEC2017 ref
runtimes, i.e. has a ~0.5% change or greater ('-' means
faster):

548.exchange2: -1.7%
510.parest: -0.78%
538.imagick: -0.73%
500.perlbench: -0.57%
525.x264: -0.55%
507.cactuBSSN: -0.5%
520.omnetpp: -0.48%
511.povray: +0.57%
544.nab: +0.65%
503.bwaves: +0.68%
526.blender: +0.75%

If this patch causes any major regressions post-commit it can
be easily reverted, but I think it should be an overall
improvement.
---
 .../Target/AArch64/AArch64SchedNeoverseV2.td  |    4 +-
 .../AArch64/Neoverse/V2-basic-instructions.s  |   10 +-
 .../AArch64/Neoverse/V2-clear-upper-regs.s    |  124 +-
 .../llvm-mca/AArch64/Neoverse/V2-forwarding.s |  692 +++---
 .../AArch64/Neoverse/V2-sve-instructions.s    |   18 +-
 .../llvm-mca/AArch64/Neoverse/V2-writeback.s  | 1950 +++++++++--------
 .../AArch64/Neoverse/V2-zero-lat-movs.s       |   24 +-
 7 files changed, 1413 insertions(+), 1409 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 39f7077ae4514..25864c88c254a 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -14,7 +14,9 @@
 //===----------------------------------------------------------------------===//
 
 def NeoverseV2Model : SchedMachineModel {
-  let IssueWidth            =  16; // Micro-ops dispatched at a time.
+  let IssueWidth            =   6; // Value based on a combination of dispatch
+                                   // constraints discussed in the SWOG, i.e. up
+                                   // to 8 MOps per cycle, and benchmark tuning.
   let MicroOpBufferSize     = 320; // Entries in micro-op re-order buffer.
   let LoadLatency           =   4; // Optimistic load latency.
   let MispredictPenalty     =  10; // Extra cycles for mispredicted branch.  NOTE: Copied from N2.
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
index 581dad6b68dcf..54b5f1644be48 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
@@ -2536,14 +2536,14 @@ drps
 # CHECK-NEXT:  1      2     0.50                        bics	x3, xzr, x3, lsl #1
 # CHECK-NEXT:  1      2     0.50                        tst	w3, w7, lsl #31
 # CHECK-NEXT:  1      2     0.50                        tst	x2, x20, asr #2
-# CHECK-NEXT:  1      0     0.06                        mov	x3, x6
-# CHECK-NEXT:  1      0     0.06                        mov	x3, xzr
-# CHECK-NEXT:  1      0     0.06                        mov	wzr, w2
-# CHECK-NEXT:  1      0     0.06                        mov	w3, w5
+# CHECK-NEXT:  1      0     0.17                        mov	x3, x6
+# CHECK-NEXT:  1      0     0.17                        mov	x3, xzr
+# CHECK-NEXT:  1      0     0.17                        mov	wzr, w2
+# CHECK-NEXT:  1      0     0.17                        mov	w3, w5
 # CHECK-NEXT:  1      1     0.17                        movz	w2, #0, lsl #16
 # CHECK-NEXT:  1      1     0.17                        mov	w2, #-1235
 # CHECK-NEXT:  1      1     0.17                        mov	x2, #5299989643264
-# CHECK-NEXT:  1      0     0.06                        mov	x2, #0
+# CHECK-NEXT:  1      0     0.17                        mov	x2, #0
 # CHECK-NEXT:  1      1     0.17                        movk	w3, #0
 # CHECK-NEXT:  1      1     0.17                        movz	x4, #0, lsl #16
 # CHECK-NEXT:  1      1     0.17                        movk	w5, #0, lsl #16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index fbf65e26e99a5..3398331a67f5b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	b0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	b0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	b0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [1] Code Region - FPR16-bit
 
@@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	h0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	h0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	h0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [2] Code Region - FPR32-bit
 
@@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	s0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	s0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	s0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [3] Code Region - FPR64-bit
 
@@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	d0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	d0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	d0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	d0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	d0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [4] Code Region - FPR128-bit
 
@@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	q0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	q0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	q0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	q0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	q0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [5] Code Region - SIMD64-bit-b
 
@@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [6] Code Region - SIMD64-bit-h
 
@@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [7] Code Region - SIMD64-bit-s
 
@@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [8] Code Region - SIMD64-bit-d
 
@@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [9] Code Region - insr
 
@@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      803
 # CHECK-NEXT: Total uOps:        300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.37
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [0,1]     D======eeER    .    .    .    .   .   add	z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,0]     D========eeeeeeER   .    .    .   .   insr	z0.s, w0
 # CHECK-NEXT: [1,1]     D==============eeER .    .    .   .   add	z0.s, z0.s, z0.s
-# CHECK-NEXT: [2,0]     D================eeeeeeER.    .   .   insr	z0.s, w0
-# CHECK-NEXT: [2,1]     D======================eeER   .   .   add	z0.s, z0.s, z0.s
-# CHECK-NEXT: [3,0]     D========================eeeeeeER .   insr	z0.s, w0
-# CHECK-NEXT: [3,1]     D==============================eeER   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0]     .D===============eeeeeeER.    .   .   insr	z0.s, w0
+# CHECK-NEXT: [2,1]     .D=====================eeER   .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0]     .D=======================eeeeeeER .   insr	z0.s, w0
+# CHECK-NEXT: [3,1]     .D=============================eeER   add	z0.s, z0.s, z0.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     13.0   0.3    0.0       insr	z0.s, w0
-# CHECK-NEXT: 1.     4     19.0   0.0    0.0       add	z0.s, z0.s, z0.s
-# CHECK-NEXT:        4     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     4     12.5   0.3    0.0       insr	z0.s, w0
+# CHECK-NEXT: 1.     4     18.5   0.0    0.0       add	z0.s, z0.s, z0.s
+# CHECK-NEXT:        4     15.5   0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
index 0f5ab183f5358..39a779b27fe7f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
@@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.57
 # CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
 # CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
 # CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,2]     .D=========eeER..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     .D===========eeER   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
 # CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       madd	x0, x0, x0, x0
-# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     7.0    0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     9.0    0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     6.8    0.1    0.0       <total>
 
 # CHECK:      [1] Code Region - smaddl
 
@@ -353,7 +353,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.57
 # CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -368,8 +368,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=====eeER.    ..   smaddl	x0, w0, w0, x0
 # CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
 # CHECK-NEXT: [1,1]     D=========eeER ..   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,2]     D==========eeER..   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,3]     D============eeER   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,2]     .D=========eeER..   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     .D===========eeER   smaddl	x0, w0, w0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -380,9 +380,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
 # CHECK-NEXT: 1.     2     6.5    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 2.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
-# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     7.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     9.0    0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     6.8    0.1    0.0       <total>
 
 # CHECK:      [2] Code Region - fmadd
 
@@ -391,7 +391,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.35
 # CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -406,12 +406,12 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmadd	d0, d1, d2, d0
 # CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmadd	d0, d1, d2, d0
 # CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmadd	d0, d0, d1, d2
-# CHECK-NEXT: [1,0]     D=================eeER   .    .    ..   fadd	d0, d0, d0
-# CHECK-NEXT: [1,1]     D===================eeeeER    .    ..   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,2]     D=======================eeeER .    ..   fmul	d0, d0, d0
-# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,5]     D==============================eeeeER   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     .D================eeER   .    .    ..   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     .D======================eeeER .    ..   fmul	d0, d0, d0
+# CHECK-NEXT: [1,3]     .D=======================eeeeER    ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,4]     .D=========================eeeeER  ..   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     .D=============================eeeeER   fmadd	d0, d0, d1, d2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -420,13 +420,13 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fadd	d0, d0, d0
-# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       fmul	d0, d0, d0
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmadd	d0, d0, d1, d2
-# CHECK-NEXT:        2     15.7   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     11.0   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 4.     2     18.0   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     22.0   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     15.2   0.1    0.0       <total>
 
 # CHECK:      [3] Code Region - saba
 
@@ -435,7 +435,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -450,8 +450,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   saba	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D=================eeeeER .  .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D======================eeeeER   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   saba	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -462,9 +462,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [4] Code Region - sdot
 
@@ -473,7 +473,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 0.8
@@ -488,8 +488,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D========eeeER .    .   .   sdot	v0.4s, v0.16b, v1.16b
 # CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D===============eeeER   .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2]     D================eeeER  .   sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3]     D===================eeeER   sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,2]     .D===============eeeER  .   sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     .D==================eeeER   sdot	v0.4s, v0.16b, v1.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -500,9 +500,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2.     2     11.5   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3.     2     14.5   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT:        2     10.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     11.0   0.0    0.0       sdot	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     14.0   0.0    0.0       sdot	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     10.5   0.1    0.0       <total>
 
 # CHECK:      [5] Code Region - smmla
 
@@ -511,7 +511,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 0.8
@@ -526,8 +526,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	v0.4s, v0.16b, v1.16b
 # CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D===============eeeER   .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2]     D================eeeER  .   smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3]     D===================eeeER   smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,2]     .D===============eeeER  .   smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3]     .D==================eeeER   smmla	v0.4s, v0.16b, v1.16b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -538,9 +538,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2.     2     11.5   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3.     2     14.5   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
-# CHECK-NEXT:        2     10.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     11.0   0.0    0.0       smmla	v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3.     2     14.0   0.0    0.0       smmla	v0.4s, v0.16b, v1.16b
+# CHECK-NEXT:        2     10.5   0.1    0.0       <total>
 
 # CHECK:      [6] Code Region - mla
 
@@ -549,7 +549,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
@@ -564,8 +564,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   mla	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D=================eeeeER .  .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D======================eeeeER   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   mla	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -576,9 +576,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [7] Code Region - sqrdmlah
 
@@ -587,7 +587,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.29
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 3.5
@@ -602,8 +602,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sqrdmlah	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT: [1,0]     D==============eeeeER    .    .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D==================eeeeER.    .   sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     D========================eeeeER   sqrdmlah	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   sqrdmlah	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -614,9 +614,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     12.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sqrdmlah	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sqrdmlah	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [8] Code Region - smlal2
 
@@ -625,7 +625,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 2.0
@@ -640,8 +640,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   smlal2	v0.4s, v0.8h, v1.8h
 # CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D=================eeeeER .  .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D======================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -652,9 +652,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [9] Code Region - sadalp
 
@@ -663,7 +663,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -678,8 +678,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   sadalp	v0.2d, v0.4s
 # CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D=================eeeeER .  .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   sadalp	v0.2d, v1.4s
-# CHECK-NEXT: [1,3]     D======================eeeeER   sadalp	v0.2d, v0.4s
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   sadalp	v0.2d, v0.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -690,9 +690,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       sadalp	v0.2d, v1.4s
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sadalp	v0.2d, v1.4s
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       sadalp	v0.2d, v0.4s
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sadalp	v0.2d, v0.4s
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [10] Code Region - ssra
 
@@ -701,7 +701,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -716,8 +716,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   ssra	v0.2d, v0.2d, #1
 # CHECK-NEXT: [1,0]     D=============eeeeER.    .  .   mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D=================eeeeER .  .   ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,3]     D======================eeeeER   ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   ssra	v0.2d, v0.2d, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -728,9 +728,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       ssra	v0.2d, v1.2d, #1
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       ssra	v0.2d, v0.2d, #1
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
 # CHECK:      [11] Code Region - fcmla
 
@@ -739,7 +739,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -754,8 +754,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fcmla	v0.2d, v0.2d, v1.2d, #90
 # CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: [1,1]     D================eeeeER  .  .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,3]     D======================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fcmla	v0.2d, v0.2d, v1.2d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -766,9 +766,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	v0.4s, v0.4s, v0.4s
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fcmla	v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fcmla	v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [12] Code Region - fmla
 
@@ -777,7 +777,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.35
 # CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -792,12 +792,12 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=======eeeeER .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
 # CHECK-NEXT: [0,4]     D=========eeeeER    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
 # CHECK-NEXT: [0,5]     D=============eeeeER.    .    .    ..   fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0]     D=================eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D==================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2]     D======================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     D========================eeeeER    ..   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4]     D==========================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5]     D==============================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     .D================eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D=================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     .D=====================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     .D=======================eeeeER    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4]     .D=========================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     .D=============================eeeeER   fmla	v0.2d, v0.2d, v1.2d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -806,13 +806,13 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4.     2     18.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5.     2     22.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT:        2     15.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     10.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     14.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4.     2     18.0   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     22.0   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     14.8   0.1    0.0       <total>
 
 # CHECK:      [13] Code Region - fmlal
 
@@ -821,7 +821,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.32
 # CHECK-NEXT: IPC:               0.32
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -836,12 +836,12 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
 # CHECK-NEXT: [0,4]     D===========eeeeER  .    .    .    .    .   fmlal	v0.4s, v1.4h, v2.4h
 # CHECK-NEXT: [0,5]     D===============eeeeER   .    .    .    .   fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0]     D===================eeeER.    .    .    .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     D======================eeeeER .    .    .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2]     D==========================eeER    .    .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     D============================eeeeER.    .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,4]     D==============================eeeeER   .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5]     D==================================eeeeER   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     .D==================eeeER.    .    .    .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     .D=====================eeeeER .    .    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     .D=========================eeER    .    .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     .D===========================eeeeER.    .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4]     .D=============================eeeeER   .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     .D=================================eeeeER   fmlal	v0.4s, v0.4h, v1.4h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -850,13 +850,13 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 4.     2     21.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5.     2     25.5   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT:        2     18.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4.     2     21.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     25.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     17.5   0.1    0.0       <total>
 
 # CHECK:      [14] Code Region - bfdot
 
@@ -865,7 +865,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.25
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -880,8 +880,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfdot	v0.4s, v0.8h, v1.8h
 # CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2]     .D=====================eeeeeER.   .   bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D==========================eeeeeER   bfdot	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -892,9 +892,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       bfdot	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
 
 # CHECK:      [15] Code Region - bfmmla
 
@@ -903,7 +903,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.21
 # CHECK-NEXT: IPC:               0.21
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -918,8 +918,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=============eeeeeeER   .    .    .    .   bfmmla	v0.4s, v0.8h, v1.8h
 # CHECK-NEXT: [1,0]     D===================eeeER.    .    .    .   fmul	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT: [1,1]     D======================eeeeeeER    .    .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D==========================eeeeeeER.    .   bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D================================eeeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2]     .D=========================eeeeeeER.    .   bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D===============================eeeeeeER   bfmmla	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -930,9 +930,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT: 1.     2     13.5   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     23.5   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       bfmmla	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [16] Code Region - bfmlalb
 
@@ -941,7 +941,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.27
 # CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -956,8 +956,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   bfmlalb	v0.4s, v0.8h, v1.8h
 # CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT: [1,1]     D==================eeeeeER    . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     D=========================eeeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2]     .D===================eeeeeER  . .   bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .D========================eeeeeER   bfmlalb	v0.4s, v0.8h, v1.8h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -968,9 +968,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     13.5   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     18.5   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       bfmlalb	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     18.0   0.0    0.0       bfmlalb	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [17] Code Region - crc32b
 
@@ -979,7 +979,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.57
 # CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -994,8 +994,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=====eeER.    ..   crc32b	w0, w0, w0
 # CHECK-NEXT: [1,0]     D=======eeER   ..   mul	w0, w0, w0
 # CHECK-NEXT: [1,1]     D=========eeER ..   crc32b	w0, w0, w1
-# CHECK-NEXT: [1,2]     D==========eeER..   crc32b	w0, w0, w1
-# CHECK-NEXT: [1,3]     D============eeER   crc32b	w0, w0, w0
+# CHECK-NEXT: [1,2]     .D=========eeER..   crc32b	w0, w0, w1
+# CHECK-NEXT: [1,3]     .D===========eeER   crc32b	w0, w0, w0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1006,9 +1006,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	w0, w0, w0
 # CHECK-NEXT: 1.     2     6.5    0.0    0.0       crc32b	w0, w0, w1
-# CHECK-NEXT: 2.     2     7.5    0.0    0.0       crc32b	w0, w0, w1
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       crc32b	w0, w0, w0
-# CHECK-NEXT:        2     7.0    0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     7.0    0.0    0.0       crc32b	w0, w0, w1
+# CHECK-NEXT: 3.     2     9.0    0.0    0.0       crc32b	w0, w0, w0
+# CHECK-NEXT:        2     6.8    0.1    0.0       <total>
 
 # CHECK:      [18] Code Region - Z saba
 
@@ -1017,7 +1017,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -1030,10 +1030,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   saba	z0.d, z1.d, z2.d
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   saba	z0.d, z1.d, z2.d
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   saba	z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D========================eeeeER   saba	z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   saba	z0.d, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1042,11 +1042,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       saba	z0.d, z1.d, z2.d
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       saba	z0.d, z0.d, z1.d
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       saba	z0.d, z1.d, z2.d
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       saba	z0.d, z0.d, z1.d
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [19] Code Region - Z sadalp
 
@@ -1055,7 +1055,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -1068,10 +1068,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sadalp	z0.d, p0/m, z1.s
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sadalp	z0.d, p0/m, z1.s
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sadalp	z0.d, p0/m, z0.s
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,3]     D========================eeeeER   sadalp	z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   sadalp	z0.d, p0/m, z0.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1080,11 +1080,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sadalp	z0.d, p0/m, z1.s
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sadalp	z0.d, p0/m, z0.s
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       sadalp	z0.d, p0/m, z1.s
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sadalp	z0.d, p0/m, z0.s
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [20] Code Region - Z ssra
 
@@ -1093,7 +1093,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 1.5
@@ -1106,10 +1106,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   ssra	z0.d, z1.d, #1
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   ssra	z0.d, z1.d, #1
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   ssra	z0.d, z0.d, #1
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   ssra	z0.d, z1.d, #1
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   ssra	z0.d, z1.d, #1
-# CHECK-NEXT: [1,3]     D========================eeeeER   ssra	z0.d, z0.d, #1
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   ssra	z0.d, z1.d, #1
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   ssra	z0.d, z0.d, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1118,11 +1118,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       ssra	z0.d, z1.d, #1
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       ssra	z0.d, z1.d, #1
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       ssra	z0.d, z0.d, #1
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       ssra	z0.d, z1.d, #1
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       ssra	z0.d, z1.d, #1
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       ssra	z0.d, z0.d, #1
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [21] Code Region - Z cdot.s
 
@@ -1131,7 +1131,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.42
 # CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1144,10 +1144,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   cdot	z0.s, z1.b, z2.b, #90
 # CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   cdot	z0.s, z1.b, z2.b, #90
 # CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   cdot	z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=================eeeER  ..   cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2]     D==================eeeER ..   cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3]     D=====================eeeER   cdot	z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0]     .D===========eeeeeER.    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D================eeeER  ..   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2]     .D=================eeeER ..   cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3]     .D====================eeeER   cdot	z0.s, z0.b, z1.b, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1156,11 +1156,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     12.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 2.     2     13.0   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 3.     2     16.0   0.0    0.0       cdot	z0.s, z0.b, z1.b, #90
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       cdot	z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3.     2     15.5   0.0    0.0       cdot	z0.s, z0.b, z1.b, #90
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [22] Code Region - Z cdot.d
 
@@ -1169,7 +1169,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -1182,10 +1182,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   cdot	z0.d, z1.h, z2.h, #90
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   cdot	z0.d, z1.h, z2.h, #90
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   cdot	z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,3]     D========================eeeeER   cdot	z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   cdot	z0.d, z0.h, z1.h, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1194,11 +1194,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       cdot	z0.d, z0.h, z1.h, #90
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       cdot	z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       cdot	z0.d, z0.h, z1.h, #90
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [23] Code Region - Z cmla.b
 
@@ -1207,7 +1207,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -1220,10 +1220,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   cmla	z0.b, z1.b, z2.b, #90
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   cmla	z0.b, z1.b, z2.b, #90
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   cmla	z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3]     D========================eeeeER   cmla	z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   cmla	z0.b, z0.b, z1.b, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1232,11 +1232,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       cmla	z0.b, z0.b, z1.b, #90
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       cmla	z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       cmla	z0.b, z0.b, z1.b, #90
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [24] Code Region - Z cmla.d
 
@@ -1245,7 +1245,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1803
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.28
 # CHECK-NEXT: IPC:               0.22
 # CHECK-NEXT: Block RThroughput: 4.0
@@ -1258,10 +1258,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   cmla	z0.d, z1.d, z2.d, #90
 # CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   cmla	z0.d, z1.d, z2.d, #90
 # CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   cmla	z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0]     D==================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=======================eeeeeER    .  .   cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   cmla	z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0]     .D=================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .  .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2]     .D=========================eeeeeER .  .   cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     .D==============================eeeeeER   cmla	z0.d, z0.d, z1.d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1270,11 +1270,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     15.0   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       cmla	z0.d, z0.d, z1.d, #90
-# CHECK-NEXT:        2     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       cmla	z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       cmla	z0.d, z0.d, z1.d, #90
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [25] Code Region - Z sdot.s
 
@@ -1283,7 +1283,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.42
 # CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1296,10 +1296,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b
 # CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b
 # CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0]     .D===========eeeeeER.    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D================eeeER  ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D=================eeeER ..   sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D====================eeeER   sdot	z0.s, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1308,11 +1308,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b
-# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sdot	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     15.5   0.0    0.0       sdot	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [26] Code Region - Z sudot
 
@@ -1321,7 +1321,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.42
 # CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1334,10 +1334,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeER    .    .    ..   sdot	z0.s, z1.b, z2.b[1]
 # CHECK-NEXT: [0,2]     D======eeeER   .    .    ..   sdot	z0.s, z1.b, z2.b[1]
 # CHECK-NEXT: [0,3]     D=========eeeER.    .    ..   sdot	z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: [1,0]     D============eeeeeER.    ..   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=================eeeER  ..   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,2]     D==================eeeER ..   sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,3]     D=====================eeeER   sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0]     .D===========eeeeeER.    ..   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D================eeeER  ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2]     .D=================eeeER ..   sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3]     .D====================eeeER   sdot	z0.s, z0.b, z1.b[1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1346,11 +1346,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 2.     2     13.0   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
-# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       sdot	z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3.     2     15.5   0.0    0.0       sdot	z0.s, z0.b, z1.b[1]
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [27] Code Region - Z sdot.d
 
@@ -1359,7 +1359,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -1372,10 +1372,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sdot	z0.d, z1.h, z2.h
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   sdot	z0.d, z1.h, z2.h
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   sdot	z0.d, z0.h, z1.h
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D========================eeeeER   sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   sdot	z0.d, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1384,11 +1384,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       sdot	z0.d, z1.h, z2.h
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       sdot	z0.d, z0.h, z1.h
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       sdot	z0.d, z1.h, z2.h
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       sdot	z0.d, z0.h, z1.h
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [28] Code Region - Z smmla
 
@@ -1397,7 +1397,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 0.8
@@ -1412,8 +1412,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D========eeeER .    .   .   smmla	z0.s, z0.b, z1.b
 # CHECK-NEXT: [1,0]     D===========eeeeER  .   .   mul	z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D===============eeeER   .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D================eeeER  .   smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D===================eeeER   smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,2]     .D===============eeeER  .   smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D==================eeeER   smmla	z0.s, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1424,9 +1424,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     6.5    0.5    0.0       mul	z0.s, z0.s, z0.s
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: 2.     2     11.5   0.0    0.0       smmla	z0.s, z1.b, z2.b
-# CHECK-NEXT: 3.     2     14.5   0.0    0.0       smmla	z0.s, z0.b, z1.b
-# CHECK-NEXT:        2     10.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     11.0   0.0    0.0       smmla	z0.s, z1.b, z2.b
+# CHECK-NEXT: 3.     2     14.0   0.0    0.0       smmla	z0.s, z0.b, z1.b
+# CHECK-NEXT:        2     10.5   0.1    0.0       <total>
 
 # CHECK:      [29] Code Region - Z mla.b
 
@@ -1435,7 +1435,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 4.0
@@ -1448,10 +1448,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   mla	z0.b, p0/m, z1.b, z2.b
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   mla	z0.b, p0/m, z1.b, z2.b
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   mla	z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D========================eeeeER   mla	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   mla	z0.b, p0/m, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1460,11 +1460,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       mla	z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       mla	z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       mla	z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [30] Code Region - Z mla.d
 
@@ -1473,7 +1473,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1803
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.28
 # CHECK-NEXT: IPC:               0.22
 # CHECK-NEXT: Block RThroughput: 4.0
@@ -1486,10 +1486,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   mla	z0.d, p0/m, z1.d, z2.d
 # CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   mla	z0.d, p0/m, z1.d, z2.d
 # CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   mla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D==================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=======================eeeeeER    .  .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0]     .D=================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .  .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=========================eeeeeER .  .   mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==============================eeeeeER   mla	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1498,11 +1498,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     15.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       mla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       mla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [31] Code Region - Z smlalb
 
@@ -1511,7 +1511,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.36
 # CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -1524,10 +1524,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   smlalb	z0.d, z1.s, z2.s
 # CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   smlalb	z0.d, z1.s, z2.s
 # CHECK-NEXT: [0,3]     D==========eeeeER   .    .    .   smlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0]     D==============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D===================eeeeER    .   smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2]     D====================eeeeER   .   smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3]     D========================eeeeER   smlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0]     .D=============eeeeeER   .    .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D==================eeeeER    .   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2]     .D===================eeeeER   .   smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3]     .D=======================eeeeER   smlalb	z0.d, z0.s, z1.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1536,11 +1536,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.0   0.0    0.0       smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 2.     2     14.0   0.0    0.0       smlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 3.     2     18.0   0.0    0.0       smlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT:        2     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       smlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 3.     2     17.5   0.0    0.0       smlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [32] Code Region - Z sqdmlalb
 
@@ -1549,7 +1549,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.33
 # CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -1562,10 +1562,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    . .   sqdmlalb	z0.d, z1.s, z2.s
 # CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   sqdmlalb	z0.d, z1.s, z2.s
 # CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   sqdmlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0]     D===============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D====================eeeeER   . .   sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3]     D==========================eeeeER   sqdmlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0]     .D==============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===================eeeeER   . .   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2]     .D=====================eeeeER . .   sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3]     .D=========================eeeeER   sqdmlalb	z0.d, z0.s, z1.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1574,11 +1574,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       sqdmlalb	z0.d, z0.s, z1.s
-# CHECK-NEXT:        2     14.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       sqdmlalb	z0.d, z1.s, z2.s
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sqdmlalb	z0.d, z0.s, z1.s
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
 
 # CHECK:      [33] Code Region - Z sqrdmlah.b
 
@@ -1587,7 +1587,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.33
 # CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -1600,10 +1600,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    . .   sqrdmlah	z0.b, z1.b, z2.b
 # CHECK-NEXT: [0,2]     D=======eeeeER .    .    .    . .   sqrdmlah	z0.b, z1.b, z2.b
 # CHECK-NEXT: [0,3]     D===========eeeeER  .    .    . .   sqrdmlah	z0.b, z0.b, z1.b
-# CHECK-NEXT: [1,0]     D===============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D====================eeeeER   . .   sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,2]     D======================eeeeER . .   sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,3]     D==========================eeeeER   sqrdmlah	z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0]     .D==============eeeeeER  .    . .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D===================eeeeER   . .   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2]     .D=====================eeeeER . .   sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3]     .D=========================eeeeER   sqrdmlah	z0.b, z0.b, z1.b
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1612,11 +1612,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: 2.     2     15.5   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       sqrdmlah	z0.b, z0.b, z1.b
-# CHECK-NEXT:        2     14.3   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     8.0    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     13.0   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: 2.     2     15.0   0.0    0.0       sqrdmlah	z0.b, z1.b, z2.b
+# CHECK-NEXT: 3.     2     19.0   0.0    0.0       sqrdmlah	z0.b, z0.b, z1.b
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
 
 # CHECK:      [34] Code Region - Z sqrdmlah.d
 
@@ -1625,7 +1625,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1803
 # CHECK-NEXT: Total uOps:        500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.28
 # CHECK-NEXT: IPC:               0.22
 # CHECK-NEXT: Block RThroughput: 4.0
@@ -1638,10 +1638,10 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   sqrdmlah	z0.d, z1.d, z2.d
 # CHECK-NEXT: [0,2]     D========eeeeeER    .    .    .    .  .   sqrdmlah	z0.d, z1.d, z2.d
 # CHECK-NEXT: [0,3]     D=============eeeeeER    .    .    .  .   sqrdmlah	z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0]     D==================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1]     D=======================eeeeeER    .  .   sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==========================eeeeeER .  .   sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D===============================eeeeeER   sqrdmlah	z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0]     .D=================eeeeeER    .    .  .   mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .  .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2]     .D=========================eeeeeER .  .   sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D==============================eeeeeER   sqrdmlah	z0.d, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1650,11 +1650,11 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	z0.d, z0.d, z0.d
-# CHECK-NEXT: 1.     2     15.0   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: 2.     2     18.0   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
-# CHECK-NEXT: 3.     2     23.0   0.0    0.0       sqrdmlah	z0.d, z0.d, z1.d
-# CHECK-NEXT:        2     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       mul	z0.d, z0.d, z0.d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       sqrdmlah	z0.d, z1.d, z2.d
+# CHECK-NEXT: 3.     2     22.5   0.0    0.0       sqrdmlah	z0.d, z0.d, z1.d
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [35] Code Region - Z fcmla ZPmZZ
 
@@ -1663,7 +1663,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.27
 # CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1678,8 +1678,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.d, p0/m, z0.d, z1.d, #90
 # CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,2]     .D===================eeeeeER  . .   fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3]     .D========================eeeeeER   fcmla	z0.d, p0/m, z0.d, z1.d, #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1690,9 +1690,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       fcmla	z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3.     2     18.0   0.0    0.0       fcmla	z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [36] Code Region - Z fcmla ZZZI
 
@@ -1701,7 +1701,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.27
 # CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1716,8 +1716,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   fcmla	z0.s, z0.s, z1.s[1], #90
 # CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D==================eeeeeER    . .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,3]     D=========================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,2]     .D===================eeeeeER  . .   fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3]     .D========================eeeeeER   fcmla	z0.s, z0.s, z1.s[1], #90
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1728,9 +1728,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       fcmla	z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3.     2     18.0   0.0    0.0       fcmla	z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
 
 # CHECK:      [37] Code Region - Z fmla ZPmZZ
 
@@ -1739,7 +1739,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1754,8 +1754,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, p0/m, z0.d, z1.d
 # CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fmla	z0.d, p0/m, z0.d, z1.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1766,9 +1766,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fmla	z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmla	z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [38] Code Region - Z fmla ZZZI
 
@@ -1777,7 +1777,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1792,8 +1792,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmla	z0.d, z0.d, z1.d[1]
 # CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,3]     D======================eeeeER   fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fmla	z0.d, z0.d, z1.d[1]
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1804,9 +1804,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
-# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fmla	z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmla	z0.d, z0.d, z1.d[1]
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [39] Code Region - Z fmlalb ZZZ
 
@@ -1815,7 +1815,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1303
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.31
 # CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1830,8 +1830,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=========eeeeER    .    .  .   fmlalb	z0.s, z0.h, z1.h
 # CHECK-NEXT: [1,0]     D=============eeeER .    .  .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D================eeeeER  .  .   fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==================eeeeER.  .   fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D======================eeeeER   fmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2]     .D=================eeeeER.  .   fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D=====================eeeeER   fmlalb	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1842,9 +1842,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     7.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     10.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     12.5   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       fmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       fmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     11.5   0.1    0.0       <total>
 
 # CHECK:      [40] Code Region - Z bfdot
 
@@ -1853,7 +1853,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.25
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1868,8 +1868,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D===========eeeeeER .    .    .   .   bfdot	z0.s, z0.h, z1.h
 # CHECK-NEXT: [1,0]     D================eeeER   .    .   .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D===================eeeeeER   .   .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D======================eeeeeER.   .   bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D===========================eeeeeER   bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2]     .D=====================eeeeeER.   .   bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D==========================eeeeeER   bfdot	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1880,9 +1880,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     9.0    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     12.0   0.0    0.0       bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     15.0   0.0    0.0       bfdot	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     20.0   0.0    0.0       bfdot	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       bfdot	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       bfdot	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     13.8   0.1    0.0       <total>
 
 # CHECK:      [41] Code Region - Z bfmmla
 
@@ -1891,7 +1891,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.21
 # CHECK-NEXT: IPC:               0.21
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1906,8 +1906,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=============eeeeeeER   .    .    .    .   bfmmla	z0.s, z0.h, z1.h
 # CHECK-NEXT: [1,0]     D===================eeeER.    .    .    .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D======================eeeeeeER    .    .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D==========================eeeeeeER.    .   bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D================================eeeeeeER   bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2]     .D=========================eeeeeeER.    .   bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D===============================eeeeeeER   bfmmla	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1918,9 +1918,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     13.5   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     23.5   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     17.0   0.0    0.0       bfmmla	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       bfmmla	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     16.0   0.1    0.0       <total>
 
 # CHECK:      [42] Code Region - bfmlalb
 
@@ -1929,7 +1929,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      1503
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.27
 # CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -1944,8 +1944,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D==========eeeeeER  .    .    . .   bfmlalb	z0.s, z0.h, z1.h
 # CHECK-NEXT: [1,0]     D===============eeeER    .    . .   fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: [1,1]     D==================eeeeeER    . .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2]     D====================eeeeeER  . .   bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3]     D=========================eeeeeER   bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2]     .D===================eeeeeER  . .   bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3]     .D========================eeeeeER   bfmlalb	z0.s, z0.h, z1.h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1956,6 +1956,6 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	z0.d, z0.d, z0.d
 # CHECK-NEXT: 1.     2     11.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 2.     2     13.5   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
-# CHECK-NEXT: 3.     2     18.5   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
-# CHECK-NEXT:        2     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       bfmlalb	z0.s, z1.h, z2.h
+# CHECK-NEXT: 3.     2     18.0   0.0    0.0       bfmlalb	z0.s, z0.h, z1.h
+# CHECK-NEXT:        2     12.8   0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 6cba45cdd42be..49af4df3d8ff5 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -5071,19 +5071,19 @@ zip2	z31.s, z31.s, z31.s
 # CHECK-NEXT:  2      2     1.00                        movs	p0.b, p0/z, p0.b
 # CHECK-NEXT:  2      2     1.00                        movs	p15.b, p15.b
 # CHECK-NEXT:  2      2     1.00                        movs	p15.b, p15/z, p15.b
-# CHECK-NEXT:  1      1     0.06                  U     mrs	x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT:  1      1     0.06                  U     mrs	x3, ZCR_EL1
-# CHECK-NEXT:  1      1     0.06                  U     mrs	x3, ZCR_EL12
-# CHECK-NEXT:  1      1     0.06                  U     mrs	x3, ZCR_EL2
-# CHECK-NEXT:  1      1     0.06                  U     mrs	x3, ZCR_EL3
+# CHECK-NEXT:  1      1     0.17                  U     mrs	x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT:  1      1     0.17                  U     mrs	x3, ZCR_EL1
+# CHECK-NEXT:  1      1     0.17                  U     mrs	x3, ZCR_EL12
+# CHECK-NEXT:  1      1     0.17                  U     mrs	x3, ZCR_EL2
+# CHECK-NEXT:  1      1     0.17                  U     mrs	x3, ZCR_EL3
 # CHECK-NEXT:  1      4     1.00                        msb	z0.b, p7/m, z1.b, z31.b
 # CHECK-NEXT:  1      5     1.00                        msb	z0.d, p7/m, z1.d, z31.d
 # CHECK-NEXT:  1      4     1.00                        msb	z0.h, p7/m, z1.h, z31.h
 # CHECK-NEXT:  1      4     1.00                        msb	z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT:  1      1     0.06                  U     msr	ZCR_EL1, x3
-# CHECK-NEXT:  1      1     0.06                  U     msr	ZCR_EL12, x3
-# CHECK-NEXT:  1      1     0.06                  U     msr	ZCR_EL2, x3
-# CHECK-NEXT:  1      1     0.06                  U     msr	ZCR_EL3, x3
+# CHECK-NEXT:  1      1     0.17                  U     msr	ZCR_EL1, x3
+# CHECK-NEXT:  1      1     0.17                  U     msr	ZCR_EL12, x3
+# CHECK-NEXT:  1      1     0.17                  U     msr	ZCR_EL2, x3
+# CHECK-NEXT:  1      1     0.17                  U     msr	ZCR_EL3, x3
 # CHECK-NEXT:  1      4     0.50                        mul	z0.b, p7/m, z0.b, z31.b
 # CHECK-NEXT:  1      4     0.50                        mul	z0.b, z1.b, z2.b
 # CHECK-NEXT:  2      5     1.00                        mul	z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
index 1ef746813966d..c7a93d1b4ed35 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
@@ -733,7 +733,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.97
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 1.7
@@ -745,8 +745,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.1d }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.2d }, [x27], #16
 # CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,4]     .D===eeeeeeER   ld1	{ v1.4s }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
 
 # CHECK:      [1] Code Region - G02
 
@@ -769,7 +769,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.97
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 1.7
@@ -781,8 +781,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.8b }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.8h }, [x27], #16
 # CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,4]     .D===eeeeeeER   ld1	{ v1.2d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -794,9 +794,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
 
 # CHECK:      [2] Code Region - G03
 
@@ -805,7 +805,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.97
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 1.7
@@ -817,8 +817,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.2s }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.4h }, [x27], x28
 # CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .D===eeeeeeER   ld1	{ v1.8h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -830,9 +830,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
 
 # CHECK:      [3] Code Region - G04
 
@@ -841,7 +841,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.76
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -852,9 +852,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.16b }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=eeeeeeER .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4]     . D==eeeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -865,10 +865,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [4] Code Region - G05
 
@@ -877,7 +877,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 3.3
@@ -888,9 +888,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.4s, v2.4s }, [x27], #32
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=eeeeeeER .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4]     . D==eeeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -901,10 +901,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [5] Code Region - G06
 
@@ -913,7 +913,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 3.3
@@ -924,9 +924,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.2d, v2.2d }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=eeeeeeER .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==eeeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -937,10 +937,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [6] Code Region - G07
 
@@ -949,7 +949,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1800
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.54
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 4.3
@@ -960,9 +960,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.8h, v2.8h }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,4]     .D===eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,2]     .D=eeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,3]     . D=eeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,4]     .  D=eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -973,10 +973,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT:        1     1.8    0.2    0.0       <total>
 
 # CHECK:      [7] Code Region - G08
 
@@ -985,7 +985,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.94
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -995,10 +995,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,4]     .D===eeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .DeeeeeeER. .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,2]     . DeeeeeeER .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .  DeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,4]     .   DeeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1008,11 +1008,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [8] Code Region - G09
 
@@ -1021,7 +1021,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.94
 # CHECK-NEXT: IPC:               0.98
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -1031,10 +1031,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeER. .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeER. .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1044,11 +1044,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [9] Code Region - G10
 
@@ -1057,7 +1057,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      608
 # CHECK-NEXT: Total uOps:        2200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.62
 # CHECK-NEXT: IPC:               0.82
 # CHECK-NEXT: Block RThroughput: 5.7
@@ -1067,10 +1067,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeER.  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeER  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==eeeeeeeER.   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,4]     .D===eeeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,1]     .DeeeeeeER.  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeER  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeeeeeeER.   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,4]     .   DeeeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1080,11 +1080,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [10] Code Region - G11
 
@@ -1093,7 +1093,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      675
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.70
 # CHECK-NEXT: IPC:               0.74
 # CHECK-NEXT: Block RThroughput: 6.7
@@ -1103,10 +1103,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeER.   .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     D=eeeeeeeER   .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,2]     D==eeeeeeeER  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     .D===eeeeeeeER.   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,4]     .D====eeeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,1]     .DeeeeeeeER   .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,2]     . DeeeeeeeER  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .  D=eeeeeeeER.   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,4]     .   D=eeeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1116,11 +1116,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3.     1     4.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT:        1     3.0    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     2.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT:        1     1.4    0.4    0.0       <total>
 
 # CHECK:      [11] Code Region - G12
 
@@ -1129,7 +1129,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      675
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.70
 # CHECK-NEXT: IPC:               0.74
 # CHECK-NEXT: Block RThroughput: 6.7
@@ -1139,10 +1139,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeER.   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     D=eeeeeeeER   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeER  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eeeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,4]     .D====eeeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeeER   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeeER  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=eeeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D=eeeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1152,11 +1152,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT:        1     1.4    0.4    0.0       <total>
 
 # CHECK:      [12] Code Region - G13
 
@@ -1165,7 +1165,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      1210
 # CHECK-NEXT: Total uOps:        2300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.90
 # CHECK-NEXT: IPC:               0.41
 # CHECK-NEXT: Block RThroughput: 5.7
@@ -1175,10 +1175,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789          01
 
 # CHECK:      [0,0]     DeeeeeeeER.    .    ..   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeeER    .    ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeER   .    ..   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eeeeeeeER .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,4]     .D==========eeeeeeeeER   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,1]     .DeeeeeeeER    .    ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   .    ..   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=eeeeeeeER .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D=======eeeeeeeeER   ld1	{ v1.b }[0], [x27], #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1188,11 +1188,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT:        1     4.2    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT:        1     2.6    0.4    0.0       <total>
 
 # CHECK:      [13] Code Region - G14
 
@@ -1201,10 +1201,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.37
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
@@ -1212,9 +1212,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.b }[8], [x27], #1
 # CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,2]     D================eeeeeeeeER   .    .    . .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     D========================eeeeeeeeER.    . .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,4]     D================================eeeeeeeeER   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   ld1	{ v1.h }[4], [x27], #2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1225,10 +1225,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
 # CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 2.     1     17.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     25.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 4.     1     33.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT:        1     17.0   0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     16.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 4.     1     31.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
 
 # CHECK:      [14] Code Region - G15
 
@@ -1237,10 +1237,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.37
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
@@ -1248,9 +1248,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.h }[0], [x27], x28
 # CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,2]     D================eeeeeeeeER   .    .    . .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,3]     D========================eeeeeeeeER.    . .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,4]     D================================eeeeeeeeER   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   ld1	{ v1.d }[0], [x27], #8
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1261,10 +1261,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
 # CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 2.     1     17.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 3.     1     25.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 4.     1     33.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT:        1     17.0   0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     16.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 4.     1     31.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
 
 # CHECK:      [15] Code Region - G16
 
@@ -1273,10 +1273,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.25
 # CHECK-NEXT: IPC:               0.42
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1284,9 +1284,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1	{ v1.d }[0], [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,4]     D====eeeeeeeeER   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld1r	{ v1.4h }, [x27], #2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1297,10 +1297,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [16] Code Region - G17
 
@@ -1309,10 +1309,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.94
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1320,9 +1320,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.4s }, [x27], #4
 # CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,4]     D====eeeeeeeeER   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld1r	{ v1.1d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1333,10 +1333,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [17] Code Region - G18
 
@@ -1345,10 +1345,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.94
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1356,9 +1356,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.2d }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,4]     D====eeeeeeeeER   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld1r	{ v1.8b }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1369,10 +1369,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [18] Code Region - G19
 
@@ -1381,10 +1381,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        1900
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.73
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 2.0
+# CHECK-NEXT: Block RThroughput: 3.2
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1392,9 +1392,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.8h }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,4]     .D===eeeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3]     . D=eeeeeeeeER.   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4]     .  D=eeeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1405,10 +1405,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT:        1     1.8    0.2    0.0       <total>
 
 # CHECK:      [19] Code Region - G20
 
@@ -1417,20 +1417,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.71
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,4]     .D===eeeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3]     .  DeeeeeeeeER.   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1440,11 +1440,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [20] Code Region - G21
 
@@ -1453,20 +1453,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.31
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.7
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeeeeeeeER.   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1476,11 +1476,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [21] Code Region - G22
 
@@ -1489,20 +1489,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      3310
 # CHECK-NEXT: Total uOps:        2100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.63
 # CHECK-NEXT: IPC:               0.15
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,2]     D================eeeeeeeeER   .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,4]     .D===============================eeeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eeeeeeeeER .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,2]     . D==============eeeeeeeeER   .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,3]     .  D=====================eeeeeeeeER.    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .   D============================eeeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1512,11 +1512,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 2.     1     17.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 4.     1     32.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT:        1     16.6   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 4.     1     29.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
 
 # CHECK:      [22] Code Region - G23
 
@@ -1525,20 +1525,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.50
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,2]     D================eeeeeeeeER   .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D========================eeeeeeeeER.    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,4]     .D===============================eeeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,1]     .D=======eeeeeeeeER .    .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,2]     . D==============eeeeeeeeER   .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=====================eeeeeeeeER.    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,4]     .   D============================eeeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1548,11 +1548,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 2.     1     17.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 3.     1     25.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 4.     1     32.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT:        1     16.8   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 4.     1     29.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
 
 # CHECK:      [23] Code Region - G24
 
@@ -1561,20 +1561,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      2603
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.77
 # CHECK-NEXT: IPC:               0.19
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789          012345678
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,2]     D================eeeeeeeeER .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D=================eeeeeeeeER.   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,4]     .D=================eeeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,1]     .D=======eeeeeeeeER .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,2]     . D==============eeeeeeeeER .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D==============eeeeeeeeER.   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1584,11 +1584,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 2.     1     17.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 3.     1     18.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 4.     1     18.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT:        1     12.6   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     15.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT:        1     10.8   0.2    0.0       <total>
 
 # CHECK:      [24] Code Region - G25
 
@@ -1597,20 +1597,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.92
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,4]     .D===eeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,3]     .  DeeeeeeeeER.   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1620,11 +1620,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [25] Code Region - G26
 
@@ -1633,20 +1633,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.92
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeeeeeeeER.   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1656,11 +1656,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [26] Code Region - G27
 
@@ -1669,20 +1669,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.51
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 2.8
+# CHECK-NEXT: Block RThroughput: 3.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeeeeeER .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeeeeeER.   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeeeeeeeER.   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1692,33 +1692,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [27] Code Region - G28
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      510
+# CHECK-NEXT: Total Cycles:      709
 # CHECK-NEXT: Total uOps:        3200
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.27
-# CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.51
+# CHECK-NEXT: IPC:               0.71
+# CHECK-NEXT: Block RThroughput: 5.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK:      [0,0]     DeeeeeeeeER    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .   DeeeeeeeeER.   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1728,33 +1728,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT:        1     1.0    0.4    0.0       <total>
 
 # CHECK:      [28] Code Region - G29
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      510
+# CHECK-NEXT: Total Cycles:      809
 # CHECK-NEXT: Total uOps:        3300
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.47
-# CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.08
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeeeeeeeeER  ..   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeER..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeeeeeeeER.   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .DeeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1764,33 +1764,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.6    0.0       <total>
 
 # CHECK:      [29] Code Region - G30
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1910
+# CHECK-NEXT: Total Cycles:      1911
 # CHECK-NEXT: Total uOps:        3200
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    1.68
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    1.67
 # CHECK-NEXT: IPC:               0.26
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 5.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER   .    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER  .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=========eeeeeeeeER    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,4]     . D================eeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .   .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER   .    .    .   .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     .  DeeeeeeeeER .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    D======eeeeeeeeER   .   .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,4]     .    .D=============eeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1800,11 +1800,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 4.     1     17.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT:        1     6.4    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     7.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT:        1     4.8    0.4    0.0       <total>
 
 # CHECK:      [30] Code Region - G31
 
@@ -1813,20 +1813,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        3000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.75
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,2]     . D==============eeeeeeeeER   .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,3]     .  D=====================eeeeeeeeER.    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,4]     .   D============================eeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1836,11 +1836,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 2.     1     16.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 4.     1     31.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 4.     1     29.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
 
 # CHECK:      [31] Code Region - G32
 
@@ -1849,20 +1849,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        3000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.75
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .D=======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,2]     . D==============eeeeeeeeER   .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=====================eeeeeeeeER.    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,4]     .   D============================eeeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1872,33 +1872,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 2.     1     16.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 4.     1     31.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     8.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 2.     1     15.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     22.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 4.     1     29.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT:        1     15.0   0.2    0.0       <total>
 
 # CHECK:      [32] Code Region - G33
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      510
+# CHECK-NEXT: Total Cycles:      709
 # CHECK-NEXT: Total uOps:        3200
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.27
-# CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.51
+# CHECK-NEXT: IPC:               0.71
+# CHECK-NEXT: Block RThroughput: 5.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER   .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK:      [0,0]     DeeeeeeeeER    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER   .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,2]     .  DeeeeeeeeER .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,3]     .   DeeeeeeeeER.   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1908,33 +1908,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT:        1     1.0    0.4    0.0       <total>
 
 # CHECK:      [33] Code Region - G34
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      510
+# CHECK-NEXT: Total Cycles:      809
 # CHECK-NEXT: Total uOps:        3300
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.47
-# CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.08
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER   .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER   ..   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,2]     .  DeeeeeeeeER ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,3]     .    DeeeeeeeeER.   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .DeeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1944,33 +1944,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.6    0.0       <total>
 
 # CHECK:      [34] Code Region - G35
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      510
+# CHECK-NEXT: Total Cycles:      709
 # CHECK-NEXT: Total uOps:        3200
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.27
-# CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.51
+# CHECK-NEXT: IPC:               0.71
+# CHECK-NEXT: Block RThroughput: 5.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER   .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeeeER  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2]     .D=eeeeeeeeER .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==eeeeeeeeER.   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeeeER    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeeeER   .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .   DeeeeeeeeER.   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1980,33 +1980,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.4    0.0       <total>
 
 # CHECK:      [35] Code Region - G36
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      710
+# CHECK-NEXT: Total Cycles:      1010
 # CHECK-NEXT: Total uOps:        4500
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.34
-# CHECK-NEXT: IPC:               0.70
-# CHECK-NEXT: Block RThroughput: 7.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.46
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeER  ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,3]     . D=eeeeeeeeER ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,4]     .  D==eeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeeeeeER .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeER.   .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .DeeeeeeeeER  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2016,33 +2016,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 4.     1     3.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT:        1     1.6    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [36] Code Region - G37
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      810
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        4900
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.05
-# CHECK-NEXT: IPC:               0.62
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.86
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 8.2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeER  . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,2]     . DeeeeeeeeeER . .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,3]     .  DeeeeeeeeeER. .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,4]     .   D===eeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeeeeeeeeER .  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeeER  .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .DeeeeeeeeeER.   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2052,33 +2052,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    3.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT:        1     1.6    0.8    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [37] Code Region - G38
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      809
+# CHECK-NEXT: Total Cycles:      1010
 # CHECK-NEXT: Total uOps:        4900
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.06
-# CHECK-NEXT: IPC:               0.62
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.85
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 8.2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeeeeeeeeER  ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeeeeeeeeeER..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,4]     .   D=eeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeeeeeER .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeER.   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeeeeeeeeER .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2088,11 +2088,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT:        1     1.2    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [38] Code Region - G39
 
@@ -2101,20 +2101,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        4000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.00
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2124,11 +2124,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 2.     1     16.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 4.     1     31.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
 
 # CHECK:      [39] Code Region - G40
 
@@ -2137,20 +2137,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        4000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.00
 # CHECK-NEXT: IPC:               0.12
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,2]     .D===============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     .D=======================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,4]     . D==============================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,2]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,4]     .    .  D========================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2160,11 +2160,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 2.     1     16.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 3.     1     24.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 4.     1     31.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT:        1     16.2   0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 2.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 4.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT:        1     13.0   0.2    0.0       <total>
 
 # CHECK:      [40] Code Region - G41
 
@@ -2173,20 +2173,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        4100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.15
 # CHECK-NEXT: IPC:               0.26
-# CHECK-NEXT: Block RThroughput: 5.3
+# CHECK-NEXT: Block RThroughput: 6.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789          01
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    ..   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     D========eeeeeeeeER ..   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,2]     .D========eeeeeeeeER..   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     . D========eeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,4]     .  D========eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     . D======eeeeeeeeER ..   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,2]     .   D=====eeeeeeeeER..   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .D====eeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,4]     .    .  D===eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2196,33 +2196,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT:        1     7.4    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     5.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT:        1     4.6    0.2    0.0       <total>
 
 # CHECK:      [41] Code Region - G42
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      659
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        4300
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.53
-# CHECK-NEXT: IPC:               0.76
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.26
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .DeeeeeeeeER   .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,3]     .  DeeeeeeeeER .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,4]     .   D=eeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     . DeeeeeeeeER  .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeER.  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,3]     .    .DeeeeeeeeER .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2232,33 +2232,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT:        1     1.2    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [42] Code Region - G43
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      610
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        4200
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.89
-# CHECK-NEXT: IPC:               0.82
-# CHECK-NEXT: Block RThroughput: 5.7
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.16
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeeeeeeeER   .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     . D=eeeeeeeeER .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,4]     .  D==eeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeeeeER  .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeER.  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeeeeeeeER .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2268,33 +2268,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 4.     1     3.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT:        1     1.6    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [43] Code Region - G44
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      508
+# CHECK-NEXT: Total Cycles:      808
 # CHECK-NEXT: Total uOps:        3400
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    6.69
-# CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.21
+# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: Block RThroughput: 5.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeeeeeeeER.   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     . D=eeeeeeE-R   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,4]     .  D=eeeeeeER   ldp	d1, d2, [x27], #496
+# CHECK:      [0,0]     DeeeeeeeeER    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     .   DeeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeeeeeER.   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,4]     .    . DeeeeeeER   ldp	d1, d2, [x27], #496
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2304,11 +2304,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    1.0       ldp	s1, s2, [x27], #248
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	d1, d2, [x27], #496
-# CHECK-NEXT:        1     1.4    0.2    0.2       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT:        1     1.0    0.8    0.0       <total>
 
 # CHECK:      [44] Code Region - G45
 
@@ -2317,20 +2317,20 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.54
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 2.3
+# CHECK-NEXT: Block RThroughput: 3.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER ..   ldp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     D=eeeeeeER..   ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,2]     D==eeeeeeER.   ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,3]     .D==eeeeeeER   ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,4]     .D===eeeeE-R   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,1]     .DeeeeeeER..   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2]     . DeeeeeeER.   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3]     .  DeeeeeeER   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4]     .   DeeeeE-R   ldp	w1, w2, [x27], #248
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2340,11 +2340,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 4.     1     4.0    0.0    1.0       ldp	w1, w2, [x27], #248
-# CHECK-NEXT:        1     2.6    0.2    0.2       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 4.     1     1.0    0.0    1.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT:        1     1.0    0.2    0.2       <total>
 
 # CHECK:      [45] Code Region - G46
 
@@ -2353,10 +2353,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.14
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01
@@ -2364,9 +2364,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeER   ..   ldp	x1, x2, [x27], #496
 # CHECK-NEXT: [0,1]     D=eeeeER  ..   ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,2]     D==eeeeER ..   ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,3]     D===eeeeeER.   ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: [0,4]     .D===eeeeeER   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,2]     .D=eeeeER ..   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3]     . D=eeeeeER.   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,4]     .  D=eeeeeER   ldpsw	x1, x2, [x27, #248]!
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2377,10 +2377,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT:        1     1.8    0.2    0.0       <total>
 
 # CHECK:      [46] Code Region - G47
 
@@ -2389,10 +2389,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -2400,9 +2400,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeER . .   ldr	b1, [x27], #254
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ldr	h1, [x27], #254
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ldr	s1, [x27], #254
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ldr	d1, [x27], #254
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=eeeeeeER .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,4]     . D==eeeeeeER   ldr	q1, [x27], #254
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2413,10 +2413,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldr	h1, [x27], #254
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldr	s1, [x27], #254
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ldr	d1, [x27], #254
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ldr	q1, [x27], #254
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [47] Code Region - G48
 
@@ -2425,10 +2425,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -2436,9 +2436,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeeeeeER . .   ldr	b1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eeeeeeER. .   ldr	h1, [x27, #254]!
-# CHECK-NEXT: [0,2]     D==eeeeeeER .   ldr	s1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D===eeeeeeER.   ldr	d1, [x27, #254]!
-# CHECK-NEXT: [0,4]     D====eeeeeeER   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,2]     .D=eeeeeeER .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D==eeeeeeER.   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,4]     . D==eeeeeeER   ldr	q1, [x27, #254]!
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2449,10 +2449,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldr	h1, [x27, #254]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldr	s1, [x27, #254]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ldr	d1, [x27, #254]!
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ldr	q1, [x27, #254]!
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [48] Code Region - G49
 
@@ -2461,7 +2461,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        1000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 1.7
@@ -2473,8 +2473,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeER   .   ldr	w1, [x27], #254
 # CHECK-NEXT: [0,1]     D=eeeeER  .   ldr	x1, [x27], #254
 # CHECK-NEXT: [0,2]     D==eeeeER .   ldr	w1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D===eeeeER.   ldr	x1, [x27, #254]!
-# CHECK-NEXT: [0,4]     D====eeeeER   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==eeeeER.   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,4]     .D===eeeeER   ldrb	w1, [x27], #254
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2486,9 +2486,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldr	x1, [x27], #254
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldr	w1, [x27, #254]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ldr	x1, [x27, #254]!
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ldrb	w1, [x27], #254
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
 
 # CHECK:      [49] Code Region - G50
 
@@ -2497,7 +2497,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        1000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 1.7
@@ -2509,8 +2509,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeER   .   ldrb	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eeeeER  .   ldrh	w1, [x27], #254
 # CHECK-NEXT: [0,2]     D==eeeeER .   ldrh	w1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D===eeeeER.   ldrsb	w1, [x27], #254
-# CHECK-NEXT: [0,4]     D====eeeeER   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==eeeeER.   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,4]     .D===eeeeER   ldrsb	x1, [x27], #254
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2522,9 +2522,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldrh	w1, [x27], #254
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldrh	w1, [x27, #254]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ldrsb	w1, [x27], #254
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ldrsb	x1, [x27], #254
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
 
 # CHECK:      [50] Code Region - G51
 
@@ -2533,7 +2533,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        1000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    1.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 1.7
@@ -2545,8 +2545,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeER   .   ldrsb	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eeeeER  .   ldrsb	x1, [x27, #254]!
 # CHECK-NEXT: [0,2]     D==eeeeER .   ldrsh	w1, [x27], #254
-# CHECK-NEXT: [0,3]     D===eeeeER.   ldrsh	x1, [x27], #254
-# CHECK-NEXT: [0,4]     D====eeeeER   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D==eeeeER.   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,4]     .D===eeeeER   ldrsh	w1, [x27, #254]!
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2558,9 +2558,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldrsb	x1, [x27, #254]!
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldrsh	w1, [x27], #254
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       ldrsh	x1, [x27], #254
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       ldrsh	w1, [x27, #254]!
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
 
 # CHECK:      [51] Code Region - G52
 
@@ -2569,10 +2569,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.38
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
@@ -2580,8 +2580,8 @@ ldr  x2, [x1], #254
 # CHECK:      [0,0]     DeeeeER .   ldrsh	x1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eeeeER.   ldrsw	x1, [x27], #254
 # CHECK-NEXT: [0,2]     D==eeeeER   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D===eeE-R   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,4]     D====eeER   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .D==eeE-R   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,4]     .D===eeER   st1	{ v1.2d }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2593,9 +2593,9 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       ldrsw	x1, [x27], #254
 # CHECK-NEXT: 2.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: 3.     1     4.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT:        1     3.0    0.2    0.2       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT:        1     2.6    0.2    0.2       <total>
 
 # CHECK:      [52] Code Region - G53
 
@@ -2604,7 +2604,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -2614,9 +2614,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.2s }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eeER  .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,2]     D==eeER .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     D===eeER.   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,4]     D====eeER   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=eeER .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D==eeER.   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,4]     . D==eeER   st1	{ v1.8h }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2627,10 +2627,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [53] Code Region - G54
 
@@ -2639,7 +2639,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -2649,9 +2649,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.16b }, [x27], #16
 # CHECK-NEXT: [0,1]     D=eeER  .   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeER .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeER.   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,4]     D====eeER   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     .D=eeER .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==eeER.   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==eeER   st1	{ v1.4h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2662,10 +2662,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [54] Code Region - G55
 
@@ -2674,7 +2674,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -2684,9 +2684,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.4s }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eeER  .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeER .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeER.   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,4]     D====eeER   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=eeER .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D==eeER.   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,4]     . D==eeER   st1	{ v1.1d, v2.1d }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2697,10 +2697,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [55] Code Region - G56
 
@@ -2709,7 +2709,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1900
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.77
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 3.5
@@ -2718,10 +2718,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     D=eeER  .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,2]     D==eeER .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     D===eeER.   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,4]     .D===eeER   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,1]     .DeeER  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,2]     .D=eeER .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     . D=eeER.   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,4]     .  D=eeER   st1	{ v1.8b, v2.8b }, [x27], #16
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2731,11 +2731,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT:        1     1.6    0.2    0.0       <total>
 
 # CHECK:      [56] Code Region - G57
 
@@ -2744,7 +2744,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.17
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 4.0
@@ -2753,10 +2753,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     D=eeER  .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,2]     D==eeER .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==eeER.   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeER   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeER  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,2]     . DeeER .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeER.   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeER   st1	{ v1.2s, v2.2s }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2766,11 +2766,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT:        1     2.6    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [57] Code Region - G58
 
@@ -2779,7 +2779,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.17
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 4.0
@@ -2788,10 +2788,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeER  .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,2]     D==eeER .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     D===eeER.   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeER   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeER  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeER .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeER.   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeER   st1	{ v1.16b, v2.16b }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2801,11 +2801,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [58] Code Region - G59
 
@@ -2814,7 +2814,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        2900
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.13
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 6.0
@@ -2823,10 +2823,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.   .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     D=eeER   .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,2]     .D=eeER  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3]     .D===eeER.   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,4]     . D===eeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .DeeER   .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,2]     . DeeER  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .  D=eeER.   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,4]     .   D=eeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2836,32 +2836,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 3.     1     4.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT:        1     2.6    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT:        1     1.4    0.4    0.0       <total>
 
 # CHECK:      [59] Code Region - G60
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      703
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3100
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    4.41
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.40
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.   .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     D=eeER   .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,2]     .D=eeER  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,3]     .D===eeER.   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,4]     . D===eeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK:      [0,0]     DeeER.    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeeER    .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,2]     .  DeeER  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,3]     .   DeeER .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4]     .    D=eeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2871,11 +2872,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 3.     1     4.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT:        1     2.6    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT:        1     1.2    0.6    0.0       <total>
 
 # CHECK:      [60] Code Region - G61
 
@@ -2884,7 +2885,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        2900
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.13
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 6.0
@@ -2893,10 +2894,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeER   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2]     .D=eeER  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==eeER .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4]     . D===eeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeER   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     . DeeER  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeER .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .   D=eeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2906,11 +2907,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT:        1     2.4    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT:        1     1.2    0.4    0.0       <total>
 
 # CHECK:      [61] Code Region - G62
 
@@ -2919,7 +2920,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.40
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 6.5
@@ -2929,10 +2930,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeER    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,2]     .D==eeER  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .D===eeER .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,4]     . D====eeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeER    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,2]     . D=eeER  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .   DeeER .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,4]     .    D=eeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2942,33 +2943,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 2.     1     3.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 4.     1     5.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT:        1     3.0    0.6    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT:        1     1.4    0.6    0.0       <total>
 
 # CHECK:      [62] Code Region - G63
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      804
+# CHECK-NEXT: Total Cycles:      805
 # CHECK-NEXT: Total uOps:        3700
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.60
 # CHECK-NEXT: IPC:               0.62
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     D=eeER    ..   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,2]     .D==eeER  ..   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     . D==eeER ..   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,4]     . D=====eeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     . DeeER   . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,2]     .  D=eeER . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    DeeER. .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,4]     .    . D=eeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2978,32 +2979,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 2.     1     3.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 4.     1     6.0    2.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT:        1     3.0    0.8    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 4.     1     2.0    2.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT:        1     1.4    1.0    0.0       <total>
 
 # CHECK:      [63] Code Region - G64
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      703
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.69
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeER   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,2]     .D==eeER .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eeER.   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,4]     . D===eeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK:      [0,0]     DeeER.    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeER   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2]     .  D=eeER .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D=eeER.   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .DeeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3013,33 +3015,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 2.     1     3.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT:        1     2.8    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT:        1     1.4    0.6    0.0       <total>
 
 # CHECK:      [64] Code Region - G65
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      706
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        3000
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    4.25
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    4.24
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeER    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,2]     .D===eeeeER .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,3]     . D===eeeeER.   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,4]     . D====eeeeER   st1	{ v1.b }[0], [x27], x28
+# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeER   .  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2]     .   DeeeeER  .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,3]     .    D=eeeeER.   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,4]     .    .D=eeeeER   st1	{ v1.b }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3049,11 +3051,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 2.     1     4.0    2.0    0.0       st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT:        1     3.0    0.6    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 3.     1     2.0    1.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT:        1     1.4    0.8    0.0       <total>
 
 # CHECK:      [65] Code Region - G66
 
@@ -3062,7 +3064,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.95
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -3072,10 +3074,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeER  .   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,2]     D==eeeeER .   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,3]     D===eeeeER.   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeeeER   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeER  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,2]     . DeeeeER .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,3]     .  DeeeeER.   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeeeER   st1	{ v1.h }[4], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3085,11 +3087,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [66] Code Region - G67
 
@@ -3098,7 +3100,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      605
 # CHECK-NEXT: Total uOps:        2300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.80
 # CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 6.0
@@ -3108,10 +3110,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D=eeeeER  .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeER .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,3]     D===eeeeER.   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,4]     .D===eeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeeeER  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeER .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .  DeeeeER.   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .   DeeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3121,11 +3123,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [67] Code Region - G68
 
@@ -3134,7 +3136,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      705
 # CHECK-NEXT: Total uOps:        2600
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.69
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 7.0
@@ -3144,10 +3146,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   ..   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     D=eeeeER  ..   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,2]     D==eeeeER ..   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,3]     .D==eeeeER..   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,4]     .D====eeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeeeER  ..   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2]     . DeeeeER ..   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,3]     .  DeeeeER..   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,4]     .   D=eeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3157,33 +3159,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 4.     1     5.0    1.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT:        1     2.8    0.4    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT:        1     1.2    0.4    0.0       <total>
 
 # CHECK:      [68] Code Region - G69
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      805
+# CHECK-NEXT: Total Cycles:      806
 # CHECK-NEXT: Total uOps:        2900
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.60
 # CHECK-NEXT: IPC:               0.62
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   . .   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     D=eeeeER  . .   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,2]     .D===eeeeER .   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D====eeeeER.   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,4]     .D=====eeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK:      [0,0]     DeeeeER   .  .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeeeER .  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,2]     .  D=eeeeER  .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D==eeeeER.   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,4]     .    D==eeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3193,33 +3195,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 2.     1     4.0    2.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT:        1     3.6    0.6    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    1.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT:        1     2.0    0.8    0.0       <total>
 
 # CHECK:      [69] Code Region - G70
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      706
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        2600
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.68
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   . .   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeER  . .   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,2]     .D=eeeeER . .   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D====eeeeER.   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,4]     .D=====eeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK:      [0,0]     DeeeeER   .  .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeER  .  .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,2]     .  DeeeeER.  .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D=eeeeER .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,4]     .    D==eeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3229,11 +3231,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    2.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT:        1     3.2    0.6    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 4.     1     3.0    1.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT:        1     1.6    0.8    0.0       <total>
 
 # CHECK:      [70] Code Region - G71
 
@@ -3242,7 +3244,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.95
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -3252,10 +3254,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeER  .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,2]     D==eeeeER .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,3]     D===eeeeER.   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,4]     .D===eeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeER  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,2]     . DeeeeER .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,3]     .  DeeeeER.   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,4]     .   DeeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3265,11 +3267,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [71] Code Region - G72
 
@@ -3278,7 +3280,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.95
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -3288,10 +3290,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeER  .   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,2]     D==eeeeER .   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D===eeeeER.   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,4]     .D===eeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeER  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,2]     . DeeeeER .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  DeeeeER.   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,4]     .   DeeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3301,11 +3303,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [72] Code Region - G73
 
@@ -3314,7 +3316,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      807
 # CHECK-NEXT: Total uOps:        3000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.72
 # CHECK-NEXT: IPC:               0.62
 # CHECK-NEXT: Block RThroughput: 7.0
@@ -3325,9 +3327,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeER .    .   .   st2g	x26, [x27], #4064
 # CHECK-NEXT: [0,1]     D=eER.    .   .   st2g	x26, [x27, #4064]!
-# CHECK-NEXT: [0,2]     D==eeeeeeER   .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .D==eeeeeER   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,4]     .D======eeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2]     .D=eeeeeeER   .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .  DeeeeeER   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,4]     .    D==eeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3338,32 +3340,32 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2g	x26, [x27], #4064
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       st2g	x26, [x27, #4064]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 4.     1     7.0    3.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT:        1     3.2    0.8    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT:        1     1.8    0.8    0.0       <total>
 
 # CHECK:      [73] Code Region - G74
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1405
+# CHECK-NEXT: Total Cycles:      1406
 # CHECK-NEXT: Total uOps:        4700
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    3.35
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    3.34
 # CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeER .    .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     .DeeeeeER .    .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,2]     . D===eeeeeeER .   .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .  D===eeeeeeER.   .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,4]     .   D=======eeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK:      [0,0]     DeeeeeeER .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeeeeeER.    .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,2]     .   D==eeeeeeER.    .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .D=eeeeeeER    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,4]     .    .  D====eeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3373,33 +3375,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 2.     1     4.0    3.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 4.     1     8.0    4.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT:        1     3.6    1.6    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 4.     1     5.0    4.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT:        1     2.4    1.8    0.0       <total>
 
 # CHECK:      [74] Code Region - G75
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1206
+# CHECK-NEXT: Total Cycles:      1207
 # CHECK-NEXT: Total uOps:        4100
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.40
 # CHECK-NEXT: IPC:               0.41
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeER .    . .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2]     .D===eeeeeeER  . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     . D===eeeeeER  . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4]     .  D======eeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK:      [0,0]     DeeeeeER  .    .  .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeER.    .  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2]     .   DeeeeeeER  .  .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeeeeER .  .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D==eeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3409,33 +3411,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2.     1     4.0    2.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4.     1     7.0    3.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT:        1     3.6    1.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT:        1     1.4    1.4    0.0       <total>
 
 # CHECK:      [75] Code Region - G76
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1106
+# CHECK-NEXT: Total Cycles:      1107
 # CHECK-NEXT: Total uOps:        3800
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    3.44
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    3.43
 # CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 11.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01234567
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeeeeER .    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,2]     .D====eeeeeER  ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,3]     . D====eeeeeER ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,4]     . D=======eeeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK:      [0,0]     DeeeeeeER .    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeER.    . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,2]     .   D==eeeeeER . .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,3]     .    .D=eeeeeER. .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .    .  D==eeeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3445,20 +3447,20 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 2.     1     5.0    3.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 4.     1     8.0    2.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT:        1     4.0    1.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT:        1     2.0    1.4    0.0       <total>
 
 # CHECK:      [76] Code Region - G77
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1005
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        3500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.48
 # CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 10.0
@@ -3468,10 +3470,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeER  .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     D=eeeeeER .    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,2]     .D===eeeeeER   .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D====eeeeeER  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,4]     . D======eeeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,1]     . DeeeeeER.    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,2]     .   DeeeeeER   .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeeeeER .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,4]     .    .  DeeeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3481,33 +3483,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 2.     1     4.0    2.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 4.     1     7.0    2.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT:        1     3.8    1.0    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [77] Code Region - G78
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total Cycles:      1305
 # CHECK-NEXT: Total uOps:        4300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.30
 # CHECK-NEXT: IPC:               0.38
 # CHECK-NEXT: Block RThroughput: 13.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01234567
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    ..   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeER .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,2]     .D===eeeeeER   ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     . D===eeeeeER  ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,4]     .  D=====eeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK:      [0,0]     DeeeeeER  .    . .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeeeeER.    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,2]     .   DeeeeeER   . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeeeeER . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,4]     .    .   DeeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3517,33 +3519,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 2.     1     4.0    2.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 4.     1     6.0    2.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT:        1     3.4    1.0    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT:        1     1.0    1.0    0.0       <total>
 
 # CHECK:      [78] Code Region - G79
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      2399
+# CHECK-NEXT: Total Cycles:      2400
 # CHECK-NEXT: Total uOps:        6900
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.88
 # CHECK-NEXT: IPC:               0.21
 # CHECK-NEXT: Block RThroughput: 24.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123
 
-# CHECK:      [0,0]     DeeeeeeER .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeeeeeeeER    .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,2]     . D====eeeeeeER.    . .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .  D=========eeeeeeeER.   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,4]     .    D========eeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK:      [0,0]     DeeeeeeER .    .    .  .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeeeeeeER   .    .  .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,2]     .    D=eeeeeeER.    .  .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . D=====eeeeeeeER .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,4]     .    .    D====eeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3553,33 +3555,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 2.     1     5.0    4.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3.     1     10.0   5.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT:        1     5.2    2.0    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     6.0    5.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 4.     1     5.0    1.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT:        1     3.0    2.0    0.0       <total>
 
 # CHECK:      [79] Code Region - G80
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1903
+# CHECK-NEXT: Total Cycles:      1904
 # CHECK-NEXT: Total uOps:        5700
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    3.00
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    2.99
 # CHECK-NEXT: IPC:               0.26
 # CHECK-NEXT: Block RThroughput: 19.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeeeER  .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeeeeeeER.    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,2]     . D=====eeeeeeER    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=====eeeeeeeER  ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,4]     .   D=========eeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK:      [0,0]     DeeeeeER  .    .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeeeeeER   .    . .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2]     .    D===eeeeeeER   . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . D==eeeeeeeER . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4]     .    .    D====eeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3589,33 +3591,33 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 2.     1     6.0    5.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 4.     1     10.0   4.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT:        1     4.8    2.0    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2.     1     4.0    4.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4.     1     5.0    4.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT:        1     2.8    2.0    0.0       <total>
 
 # CHECK:      [80] Code Region - G81
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      500
-# CHECK-NEXT: Total Cycles:      1658
+# CHECK-NEXT: Total Cycles:      1659
 # CHECK-NEXT: Total uOps:        4900
 
-# CHECK:      Dispatch Width:    16
-# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               0.30
 # CHECK-NEXT: Block RThroughput: 16.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .  .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     . DeeeeeeeER   .    .  .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,2]     .  D=========eeeeeeER  .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,3]     .  D===========eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,4]     .  D============eeeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK:      [0,0]     DeeeeeeeER.    .    .   .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeeeeeeER  .    .   .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2]     .    .D======eeeeeeER   .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,3]     .    . D========eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,4]     .    .  D========eeeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3626,10 +3628,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
 # CHECK-NEXT: 1.     1     1.0    1.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 2.     1     10.0   9.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 3.     1     12.0   1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT:        1     7.4    2.4    0.0       <total>
+# CHECK-NEXT: 2.     1     7.0    7.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 3.     1     9.0    2.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT:        1     5.4    2.2    0.0       <total>
 
 # CHECK:      [81] Code Region - G82
 
@@ -3638,7 +3640,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.30
 # CHECK-NEXT: IPC:               0.66
 # CHECK-NEXT: Block RThroughput: 7.5
@@ -3648,10 +3650,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     D=eeeeeeER.   .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,2]     D===eeeeeeER  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,3]     .D===eeeeeeER .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,4]     .D=====eeeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeeeeeER.   .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,2]     . D=eeeeeeER  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,3]     .  D=eeeeeeER .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .   D==eeeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3661,11 +3663,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 2.     1     4.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 4.     1     6.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT:        1     3.4    0.6    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 4.     1     3.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT:        1     1.8    0.6    0.0       <total>
 
 # CHECK:      [82] Code Region - G83
 
@@ -3674,7 +3676,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        2700
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.84
 # CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 7.0
@@ -3684,10 +3686,10 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     D=eeeeeeER.   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,2]     .D==eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,3]     .D===eeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,4]     . D===eE--R   stg	x26, [x27], #4064
+# CHECK-NEXT: [0,1]     .DeeeeeeER.   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,2]     . D=eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,3]     .   DeeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,4]     .    DeE--R   stg	x26, [x27], #4064
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3697,11 +3699,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 2.     1     3.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 4.     1     4.0    0.0    2.0       stg	x26, [x27], #4064
-# CHECK-NEXT:        1     2.8    0.4    0.4       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 4.     1     1.0    0.0    2.0       stg	x26, [x27], #4064
+# CHECK-NEXT:        1     1.2    0.4    0.4       <total>
 
 # CHECK:      [83] Code Region - G84
 
@@ -3710,19 +3712,19 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1700
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.37
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 2.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeER .  .   stg	x26, [x27, #4064]!
 # CHECK-NEXT: [0,1]     D=eER.  .   stgp	x1, x2, [x27], #992
-# CHECK-NEXT: [0,2]     D==eER  .   stgp	x1, x2, [x27, #992]!
-# CHECK-NEXT: [0,3]     D===eeER.   stp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,4]     .D===eeER   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,2]     .D=eER  .   stgp	x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,3]     . D=eeER.   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,4]     .  D=eeER   stp	d1, d2, [x27], #496
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3733,10 +3735,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stg	x26, [x27, #4064]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       stgp	x1, x2, [x27], #992
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stgp	x1, x2, [x27, #992]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       stp	s1, s2, [x27], #248
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       stp	d1, d2, [x27], #496
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stgp	x1, x2, [x27, #992]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT:        1     1.8    0.2    0.0       <total>
 
 # CHECK:      [84] Code Region - G85
 
@@ -3745,19 +3747,19 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.84
 # CHECK-NEXT: IPC:               0.71
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.   .   stp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     D==eeER  .   stp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,2]     D===eeER .   stp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,3]     .D===eeER.   stp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,4]     .D=====eER   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,1]     .D=eeER  .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2]     . D=eeER .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3]     .  D=eeER.   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4]     .   D==eER   stp	w1, w2, [x27], #248
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3767,11 +3769,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       stp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       stp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       stp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       stp	w1, w2, [x27], #248
-# CHECK-NEXT:        1     3.6    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT:        1     2.0    0.2    0.0       <total>
 
 # CHECK:      [85] Code Region - G86
 
@@ -3780,19 +3782,19 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1700
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.37
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 2.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeER .  .   stp	x1, x2, [x27], #496
 # CHECK-NEXT: [0,1]     D=eER.  .   stp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,2]     D==eER  .   stp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,3]     D===eeER.   str	b1, [x27], #254
-# CHECK-NEXT: [0,4]     .D===eeER   str	h1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=eER  .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3]     . D=eeER.   str	b1, [x27], #254
+# CHECK-NEXT: [0,4]     .  D=eeER   str	h1, [x27], #254
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3803,10 +3805,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       stp	w1, w2, [x27, #248]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	x1, x2, [x27, #496]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       str	b1, [x27], #254
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	h1, [x27], #254
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT:        1     1.8    0.2    0.0       <total>
 
 # CHECK:      [86] Code Region - G87
 
@@ -3815,19 +3817,19 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.97
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   str	s1, [x27], #254
-# CHECK-NEXT: [0,1]     D=eeER  .   str	d1, [x27], #254
-# CHECK-NEXT: [0,2]     D==eeER .   str	q1, [x27], #254
-# CHECK-NEXT: [0,3]     D===eeER.   str	b1, [x27, #254]!
-# CHECK-NEXT: [0,4]     .D===eeER   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeER  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,2]     . DeeER .   str	q1, [x27], #254
+# CHECK-NEXT: [0,3]     .  DeeER.   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,4]     .   DeeER   str	h1, [x27, #254]!
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3837,11 +3839,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       str	d1, [x27], #254
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	q1, [x27], #254
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       str	b1, [x27, #254]!
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	h1, [x27, #254]!
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT:        1     1.0    0.2    0.0       <total>
 
 # CHECK:      [87] Code Region - G88
 
@@ -3850,19 +3852,19 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      503
 # CHECK-NEXT: Total uOps:        1800
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.58
 # CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     01234567
 
 # CHECK:      [0,0]     DeeER. .   str	s1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D=eeER .   str	d1, [x27, #254]!
-# CHECK-NEXT: [0,2]     D==eeER.   str	q1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D===eER.   str	w1, [x27], #254
-# CHECK-NEXT: [0,4]     .D===eER   str	x1, [x27], #254
+# CHECK-NEXT: [0,1]     .DeeER .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,2]     . DeeER.   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .  DeER.   str	w1, [x27], #254
+# CHECK-NEXT: [0,4]     .  D=eER   str	x1, [x27], #254
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3872,11 +3874,11 @@ ldr  x2, [x1], #254
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       str	d1, [x27, #254]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	q1, [x27, #254]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       str	w1, [x27], #254
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	x1, [x27], #254
-# CHECK-NEXT:        1     2.8    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 3.     1     1.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT:        1     1.2    0.2    0.0       <total>
 
 # CHECK:      [88] Code Region - G89
 
@@ -3885,7 +3887,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      503
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -3895,9 +3897,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeER . .   str	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eER. .   str	x1, [x27, #254]!
-# CHECK-NEXT: [0,2]     D==eER .   strb	w1, [x27], #254
-# CHECK-NEXT: [0,3]     D===eER.   strb	w1, [x27, #254]!
-# CHECK-NEXT: [0,4]     D====eER   strh	w1, [x27], #254
+# CHECK-NEXT: [0,2]     .D=eER .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,3]     .D==eER.   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,4]     . D==eER   strh	w1, [x27], #254
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3908,10 +3910,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       str	x1, [x27, #254]!
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       strb	w1, [x27], #254
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       strb	w1, [x27, #254]!
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       strh	w1, [x27], #254
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [89] Code Region - G90
 
@@ -3920,7 +3922,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      503
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    2.98
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 2.5
@@ -3930,9 +3932,9 @@ ldr  x2, [x1], #254
 
 # CHECK:      [0,0]     DeER . .   strh	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eER. .   stz2g	x26, [x27], #4064
-# CHECK-NEXT: [0,2]     D==eER .   stz2g	x26, [x27, #4064]!
-# CHECK-NEXT: [0,3]     D===eER.   stzg	x26, [x27], #4064
-# CHECK-NEXT: [0,4]     D====eER   stzg	x26, [x27, #4064]!
+# CHECK-NEXT: [0,2]     .D=eER .   stz2g	x26, [x27, #4064]!
+# CHECK-NEXT: [0,3]     .D==eER.   stzg	x26, [x27], #4064
+# CHECK-NEXT: [0,4]     . D==eER   stzg	x26, [x27, #4064]!
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3943,10 +3945,10 @@ ldr  x2, [x1], #254
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       stz2g	x26, [x27], #4064
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stz2g	x26, [x27, #4064]!
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       stzg	x26, [x27], #4064
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       stzg	x26, [x27, #4064]!
-# CHECK-NEXT:        1     3.0    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stz2g	x26, [x27, #4064]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       stzg	x26, [x27], #4064
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       stzg	x26, [x27, #4064]!
+# CHECK-NEXT:        1     2.2    0.2    0.0       <total>
 
 # CHECK:      [90] Code Region - G91
 
@@ -3955,7 +3957,7 @@ ldr  x2, [x1], #254
 # CHECK-NEXT: Total Cycles:      110
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    3.64
 # CHECK-NEXT: IPC:               1.82
 # CHECK-NEXT: Block RThroughput: 0.7
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
index 1690d9669b396..3ddb525327015 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
@@ -23,18 +23,18 @@ mov  x1, x2
 # CHECK-NEXT: [6]: HasSideEffects (U)
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
-# CHECK-NEXT:  1      0     0.06                        mov	x1, #0
-# CHECK-NEXT:  1      0     0.06                        mov	x1, xzr
-# CHECK-NEXT:  1      0     0.06                        mov	w1, #0
-# CHECK-NEXT:  1      0     0.06                        mov	w1, wzr
-# CHECK-NEXT:  1      0     0.06                        fmov	h1, wzr
-# CHECK-NEXT:  1      0     0.06                        fmov	h1, xzr
-# CHECK-NEXT:  1      0     0.06                        fmov	s1, wzr
-# CHECK-NEXT:  1      0     0.06                        fmov	d1, xzr
-# CHECK-NEXT:  1      0     0.06                        movi	d1, #0000000000000000
-# CHECK-NEXT:  1      0     0.06                        movi	v1.2d, #0000000000000000
-# CHECK-NEXT:  1      0     0.06                        mov	w1, w2
-# CHECK-NEXT:  1      0     0.06                        mov	x1, x2
+# CHECK-NEXT:  1      0     0.17                        mov	x1, #0
+# CHECK-NEXT:  1      0     0.17                        mov	x1, xzr
+# CHECK-NEXT:  1      0     0.17                        mov	w1, #0
+# CHECK-NEXT:  1      0     0.17                        mov	w1, wzr
+# CHECK-NEXT:  1      0     0.17                        fmov	h1, wzr
+# CHECK-NEXT:  1      0     0.17                        fmov	h1, xzr
+# CHECK-NEXT:  1      0     0.17                        fmov	s1, wzr
+# CHECK-NEXT:  1      0     0.17                        fmov	d1, xzr
+# CHECK-NEXT:  1      0     0.17                        movi	d1, #0000000000000000
+# CHECK-NEXT:  1      0     0.17                        movi	v1.2d, #0000000000000000
+# CHECK-NEXT:  1      0     0.17                        mov	w1, w2
+# CHECK-NEXT:  1      0     0.17                        mov	x1, x2
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0.0] - V2UnitB



More information about the llvm-commits mailing list