[llvm] [AArch64] Corrected Latency Descriptions for NeoverseV2/N2 Scheduler (PR #147339)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 7 09:19:23 PDT 2025


https://github.com/yafet-a created https://github.com/llvm/llvm-project/pull/147339

Certain vector instructions for the Neoverse V2 and N2 Schedulers had been using incorrect latency descriptions based on errors from the Software Optimisation Guide ([SWOG](https://developer.arm.com/documentation/109898/latest/)). This PR updates the Neoverse V2 and N2 Schedulers to reflect the correct latencies along with having updated the relevant mca tests.

>From be0be810f8802f87751307870e7f86074e90c251 Mon Sep 17 00:00:00 2001
From: Yafet Beyene <ybeyene at ybeyene-mlt.client.nvidia.com>
Date: Mon, 30 Jun 2025 14:15:29 +0100
Subject: [PATCH] Updated incorrect Neoverse instructions from the SWOG

---
 .../Target/AArch64/AArch64SchedNeoverseN2.td  |   21 +-
 .../Target/AArch64/AArch64SchedNeoverseV2.td  |   17 +-
 .../AArch64/Neoverse/N2-sve-instructions.s    |   18 +-
 .../llvm-mca/AArch64/Neoverse/N2-writeback.s  | 4018 ++++++++---------
 .../AArch64/Neoverse/V2-neon-instructions.s   |   30 +-
 5 files changed, 2053 insertions(+), 2051 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 91a707910a7f3..59e5afefa97ab 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -175,8 +175,8 @@ def N2Write_2c_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> {
   let NumMicroOps = 2;
 }
 
-def N2Write_4c_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> {
-  let Latency     = 4;
+def N2Write_5c_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> {
+  let Latency     = 5;
   let NumMicroOps = 2;
 }
 
@@ -294,11 +294,6 @@ def N2Write_9c_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> {
   let NumMicroOps = 2;
 }
 
-def N2Write_4c_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> {
-  let Latency     = 4;
-  let NumMicroOps = 2;
-}
-
 //===----------------------------------------------------------------------===//
 // Define generic 3 micro-op types
 
@@ -1006,14 +1001,14 @@ def : InstRW<[N2Write_4c_1V1],
              (instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>;
 
 // ASIMD arith, reduce, 4H/4S
-def : InstRW<[N2Write_2c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+def : InstRW<[N2Write_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
 
 // ASIMD arith, reduce, 8B/8H
-def : InstRW<[N2Write_4c_1V1_1V],
+def : InstRW<[N2Write_5c_1V1_1V],
              (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
 
 // ASIMD arith, reduce, 16B
-def : InstRW<[N2Write_4c_1V1], (instrs ADDVv16i8v, SADDLVv16i8v,
+def : InstRW<[N2Write_6c_1V1], (instrs ADDVv16i8v, SADDLVv16i8v,
                                        UADDLVv16i8v)>;
 
 // ASIMD dot product
@@ -1025,15 +1020,15 @@ def : InstRW<[N2Write_3c_1V],
 def : InstRW<[N2Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
 
 // ASIMD max/min, reduce, 4H/4S
-def : InstRW<[N2Write_2c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+def : InstRW<[N2Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
                                           "^[SU](MAX|MIN)Vv4i32v$")>;
 
 // ASIMD max/min, reduce, 8B/8H
-def : InstRW<[N2Write_4c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+def : InstRW<[N2Write_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
                                              "^[SU](MAX|MIN)Vv8i16v$")>;
 
 // ASIMD max/min, reduce, 16B
-def : InstRW<[N2Write_4c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+def : InstRW<[N2Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
 
 // ASIMD multiply
 def : InstRW<[N2Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 8d3a4553d4b73..59dc7847c9125 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -157,6 +157,7 @@ def V2Write_20c_1V0  : SchedWriteRes<[V2UnitV0]>  { let Latency = 20;
 def V2Write_2c_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 2; }
 def V2Write_2c_1V13  : SchedWriteRes<[V2UnitV13]> { let Latency = 2; }
 def V2Write_3c_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 3; }
+def V2Write_3c_1V13  : SchedWriteRes<[V2UnitV13]> { let Latency = 3; }
 def V2Write_4c_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 4; }
 def V2Write_4c_1V13  : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
 def V2Write_6c_1V1   : SchedWriteRes<[V2UnitV1]>  { let Latency = 6; }
@@ -261,6 +262,11 @@ def V2Write_4c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
   let NumMicroOps = 2;
 }
 
+def V2Write_5c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
+  let Latency     = 5;
+  let NumMicroOps = 2;
+}
+
 def V2Write_4c_2V0 : SchedWriteRes<[V2UnitV0, V2UnitV0]> {
   let Latency     = 4;
   let NumMicroOps = 2;
@@ -381,6 +387,11 @@ def V2Write_4c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
   let NumMicroOps = 2;
 }
 
+def V2Write_6c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
+  let Latency     = 6;
+  let NumMicroOps = 2;
+}
+
 def V2Write_8c_1M0_1V01 : SchedWriteRes<[V2UnitM0, V2UnitV01]> {
   let Latency     = 8;
   let NumMicroOps = 2;
@@ -1468,14 +1479,14 @@ def : SchedAlias<WriteVq, V2Write_2c_1V>;
 def : InstRW<[V2Wr_VA, V2Rd_VA], (instregex "^[SU]ABAL?v")>;
 
 // ASIMD arith, reduce, 4H/4S
-def : InstRW<[V2Write_2c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+def : InstRW<[V2Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
 
 // ASIMD arith, reduce, 8B/8H
-def : InstRW<[V2Write_4c_1V13_1V],
+def : InstRW<[V2Write_5c_1V13_1V],
              (instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
 
 // ASIMD arith, reduce, 16B
-def : InstRW<[V2Write_4c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+def : InstRW<[V2Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index ef9d4463ebe52..99e39567b1ad6 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -5066,19 +5066,19 @@ zip2	z31.s, z31.s, z31.s
 # CHECK-NEXT:  2      2     1.00                        movs	p0.b, p0/z, p0.b
 # CHECK-NEXT:  2      2     1.00                        movs	p15.b, p15.b
 # CHECK-NEXT:  2      2     1.00                        movs	p15.b, p15/z, p15.b
-# CHECK-NEXT:  1      1     0.20                  U     mrs	x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT:  1      1     0.20                  U     mrs	x3, ZCR_EL1
-# CHECK-NEXT:  1      1     0.20                  U     mrs	x3, ZCR_EL12
-# CHECK-NEXT:  1      1     0.20                  U     mrs	x3, ZCR_EL2
-# CHECK-NEXT:  1      1     0.20                  U     mrs	x3, ZCR_EL3
+# CHECK-NEXT:  1      1     0.10                  U     mrs	x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT:  1      1     0.10                  U     mrs	x3, ZCR_EL1
+# CHECK-NEXT:  1      1     0.10                  U     mrs	x3, ZCR_EL12
+# CHECK-NEXT:  1      1     0.10                  U     mrs	x3, ZCR_EL2
+# CHECK-NEXT:  1      1     0.10                  U     mrs	x3, ZCR_EL3
 # CHECK-NEXT:  1      4     1.00                        msb	z0.b, p7/m, z1.b, z31.b
 # CHECK-NEXT:  2      5     2.00                        msb	z0.d, p7/m, z1.d, z31.d
 # CHECK-NEXT:  1      4     1.00                        msb	z0.h, p7/m, z1.h, z31.h
 # CHECK-NEXT:  1      4     1.00                        msb	z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT:  1      1     0.20                  U     msr	ZCR_EL1, x3
-# CHECK-NEXT:  1      1     0.20                  U     msr	ZCR_EL12, x3
-# CHECK-NEXT:  1      1     0.20                  U     msr	ZCR_EL2, x3
-# CHECK-NEXT:  1      1     0.20                  U     msr	ZCR_EL3, x3
+# CHECK-NEXT:  1      1     0.10                  U     msr	ZCR_EL1, x3
+# CHECK-NEXT:  1      1     0.10                  U     msr	ZCR_EL12, x3
+# CHECK-NEXT:  1      1     0.10                  U     msr	ZCR_EL2, x3
+# CHECK-NEXT:  1      1     0.10                  U     msr	ZCR_EL3, x3
 # CHECK-NEXT:  1      4     1.00                        mul	z0.b, p7/m, z0.b, z31.b
 # CHECK-NEXT:  1      4     1.00                        mul	z0.b, z1.b, z2.b
 # CHECK-NEXT:  2      5     2.00                        mul	z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
index dee46a304582b..5ffaf9138d482 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -1185,10 +1185,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1197,13 +1197,13 @@ add x0, x27, 1
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.1d }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeeeeeER.   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     . D==eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .  D==eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     D===eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1215,14 +1215,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 9.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.1    2.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    2.0       <total>
 
 # CHECK:      [1] Code Region - G02
 
@@ -1231,10 +1231,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1243,13 +1243,13 @@ add x0, x27, 1
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.8b }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeeeeeER.   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     . D==eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .  D==eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     D===eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1261,14 +1261,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.1    2.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    2.0       <total>
 
 # CHECK:      [2] Code Region - G03
 
@@ -1277,10 +1277,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.95
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1289,13 +1289,13 @@ add x0, x27, 1
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.2s }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeeeeeER.   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     . D==eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .  D==eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     D===eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1307,14 +1307,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     3.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.1    2.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    2.0       <total>
 
 # CHECK:      [3] Code Region - G04
 
@@ -1323,10 +1323,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1900
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.74
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1334,14 +1334,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.16b }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1352,15 +1352,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    2.0       <total>
 
 # CHECK:      [4] Code Region - G05
 
@@ -1369,10 +1369,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.94
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1380,14 +1380,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.4s, v2.4s }, [x27], #32
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1398,15 +1398,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [5] Code Region - G06
 
@@ -1415,10 +1415,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.94
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1426,14 +1426,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.2d, v2.2d }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1444,15 +1444,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [6] Code Region - G07
 
@@ -1461,10 +1461,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2300
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.53
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 4.6
+# CHECK-NEXT: Block RThroughput: 4.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
@@ -1472,14 +1472,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.8h, v2.8h }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1490,15 +1490,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [7] Code Region - G08
 
@@ -1507,7 +1507,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.92
 # CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -1518,14 +1518,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1536,15 +1536,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [8] Code Region - G09
 
@@ -1553,7 +1553,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.92
 # CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -1564,14 +1564,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1582,26 +1582,26 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [9] Code Region - G10
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      708
+# CHECK-NEXT: Total Cycles:      608
 # CHECK-NEXT: Total uOps:        2700
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.81
-# CHECK-NEXT: IPC:               1.41
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.44
+# CHECK-NEXT: IPC:               1.64
 # CHECK-NEXT: Block RThroughput: 5.7
 
 # CHECK:      Timeline view:
@@ -1610,14 +1610,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeER .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE----R .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER   .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE----R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     .   DeE-----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .DeE-----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER.  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeER.   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     . D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     . D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1628,42 +1628,42 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 9.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.3    0.2    2.2       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.1    2.2       <total>
 
 # CHECK:      [10] Code Region - G11
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1008
+# CHECK-NEXT: Total Cycles:      675
 # CHECK-NEXT: Total uOps:        3000
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    2.98
-# CHECK-NEXT: IPC:               0.99
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.44
+# CHECK-NEXT: IPC:               1.48
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeE-----R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeER   . .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .  DeE-----R   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeER . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     .    DeE-----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeER .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . DeE-----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .   DeE-----R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.   .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE-----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER   .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eE-----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     . D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER.   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1673,43 +1673,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 5.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 9.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    2.5       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    2.5       <total>
 
 # CHECK:      [11] Code Region - G12
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1008
+# CHECK-NEXT: Total Cycles:      675
 # CHECK-NEXT: Total uOps:        3000
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    2.98
-# CHECK-NEXT: IPC:               0.99
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.44
+# CHECK-NEXT: IPC:               1.48
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     .DeE-----R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeER   . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE-----R   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeER . .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE-----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE-----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE-----R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     D=eE-----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1719,25 +1719,25 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    2.5       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    2.5       <total>
 
 # CHECK:      [12] Code Region - G13
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1212
+# CHECK-NEXT: Total Cycles:      1210
 # CHECK-NEXT: Total uOps:        2800
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.31
 # CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 5.7
@@ -1746,16 +1746,16 @@ add x0, x27, 1
 # CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789          01
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE-----R.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeER   .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE-----R   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeER .    .  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE-----R .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeER    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE-----R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . D======eeeeeeeeER   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .    . D=======eE------R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    ..   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    .    ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER   .    ..   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D========eeeeeeeeER   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .  D=========eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1765,16 +1765,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 9.     1     8.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.4    2.6       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.3    0.2    2.6       <total>
 
 # CHECK:      [13] Code Region - G14
 
@@ -1783,10 +1783,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.50
 # CHECK-NEXT: IPC:               0.25
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
@@ -1794,14 +1794,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.b }[8], [x27], #1
 # CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D=========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D========================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1812,15 +1812,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 7.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.7   0.1    3.0       <total>
 
 # CHECK:      [14] Code Region - G15
 
@@ -1829,10 +1829,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.50
 # CHECK-NEXT: IPC:               0.25
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
@@ -1840,14 +1840,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.h }[0], [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     D=========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D========================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1858,15 +1858,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 7.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.7   0.1    3.0       <total>
 
 # CHECK:      [15] Code Region - G16
 
@@ -1875,10 +1875,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    1.66
 # CHECK-NEXT: IPC:               0.83
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1886,14 +1886,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1	{ v1.d }[0], [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     . D=eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .  D=eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     .   D=eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1904,15 +1904,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: 7.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: 9.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [16] Code Region - G17
 
@@ -1921,10 +1921,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.92
 # CHECK-NEXT: IPC:               1.96
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1932,14 +1932,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.4s }, [x27], #4
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     . D=eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .  D=eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1950,15 +1950,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: 7.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [17] Code Region - G18
 
@@ -1967,10 +1967,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.92
 # CHECK-NEXT: IPC:               1.96
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -1978,14 +1978,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.2d }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1996,15 +1996,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [18] Code Region - G19
 
@@ -2013,10 +2013,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2400
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.71
 # CHECK-NEXT: IPC:               1.96
-# CHECK-NEXT: Block RThroughput: 4.8
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -2024,14 +2024,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.8h }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .  DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .   DeE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     . D==eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2042,42 +2042,42 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.1    3.0       <total>
 
 # CHECK:      [19] Code Region - G20
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      909
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2900
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.19
-# CHECK-NEXT: IPC:               1.10
-# CHECK-NEXT: Block RThroughput: 5.8
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.69
+# CHECK-NEXT: IPC:               1.96
+# CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
+# CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    . .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeE------R    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER   . .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     . DeE------R   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  DeeeeeeeeER . .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .   DeE------R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    DeeeeeeeeER .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . DeeeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .  DeE------R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeeeeER.   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     . D==eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D==eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2087,43 +2087,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.4    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.2    0.1    3.0       <total>
 
 # CHECK:      [20] Code Region - G21
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      709
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2700
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.81
-# CHECK-NEXT: IPC:               1.41
-# CHECK-NEXT: Block RThroughput: 5.4
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.29
+# CHECK-NEXT: IPC:               1.96
+# CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2	{ v1.2s, v2.2s }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER  .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .  DeE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .   DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2134,15 +2134,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.2    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.1    3.0       <total>
 
 # CHECK:      [21] Code Region - G22
 
@@ -2151,25 +2151,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      3310
 # CHECK-NEXT: Total uOps:        2600
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.79
 # CHECK-NEXT: IPC:               0.30
-# CHECK-NEXT: Block RThroughput: 5.2
+# CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     . D=======eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     .  D==============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .   D=====================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    D============================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     . D=======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2179,16 +2179,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 3.     1     8.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 5.     1     15.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     29.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.0   0.1    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     24.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    3.0       <total>
 
 # CHECK:      [22] Code Region - G23
 
@@ -2197,7 +2197,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.62
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -2208,14 +2208,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], #4
 # CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     D=========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .D========================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2226,15 +2226,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.7   0.1    3.0       <total>
 
 # CHECK:      [23] Code Region - G24
 
@@ -2243,7 +2243,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      2603
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.96
 # CHECK-NEXT: IPC:               0.38
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -2254,14 +2254,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     .D========eE------R .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D===============eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==============eeeeeeeeER.   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .  D===============eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==============eeeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D===============eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     D=========eE------R .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D================eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D================eeeeeeeeER.   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .D=================eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D================eeeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D=================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2272,15 +2272,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 7.     1     16.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     16.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.3   0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     18.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     18.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.5   0.1    3.0       <total>
 
 # CHECK:      [24] Code Region - G25
 
@@ -2289,7 +2289,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.90
 # CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -2300,14 +2300,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.2s, v2.2s }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     . D=eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .  D=eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     .   D=eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2318,15 +2318,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 7.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 9.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [25] Code Region - G26
 
@@ -2335,7 +2335,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.90
 # CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
@@ -2346,14 +2346,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.16b, v2.16b }, [x27], #2
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2364,15 +2364,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [26] Code Region - G27
 
@@ -2381,10 +2381,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      609
 # CHECK-NEXT: Total uOps:        2800
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.60
 # CHECK-NEXT: IPC:               1.64
-# CHECK-NEXT: Block RThroughput: 5.6
+# CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     01234
@@ -2392,14 +2392,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeeeER .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeeeER.   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     .    DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2410,42 +2410,42 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.4    0.1    3.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [27] Code Region - G28
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3700
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.67
-# CHECK-NEXT: IPC:               0.99
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.87
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2455,43 +2455,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [28] Code Region - G29
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.77
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 7.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.01
+# CHECK-NEXT: IPC:               1.32
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
-
-# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2501,43 +2501,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [29] Code Region - G30
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2011
+# CHECK-NEXT: Total Cycles:      2010
 # CHECK-NEXT: Total uOps:        3700
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    1.84
 # CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789 
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .    .    .    .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE------R  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE------R.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D======eeeeeeeeER  .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .    . D======eE------R  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D============eeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .    .   D============eE------R   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .   .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .    .    .   .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D========eeeeeeeeER   .   .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .  D=========eE------R   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===============eeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .   D================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2547,16 +2547,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     7.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 7.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     13.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 9.     1     13.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.6    0.3    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.3    0.2    3.0       <total>
 
 # CHECK:      [30] Code Region - G31
 
@@ -2565,7 +2565,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        3500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.87
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 7.5
@@ -2575,15 +2575,15 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2593,16 +2593,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [31] Code Region - G32
 
@@ -2611,7 +2611,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        3500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    0.87
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 7.5
@@ -2621,15 +2621,15 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2639,43 +2639,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [32] Code Region - G33
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3700
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.67
-# CHECK-NEXT: IPC:               0.99
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.87
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
-
-# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2685,43 +2685,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [33] Code Region - G34
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.77
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 7.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.01
+# CHECK-NEXT: IPC:               1.32
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
-
-# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2731,43 +2731,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [34] Code Region - G35
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3700
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.67
-# CHECK-NEXT: IPC:               0.99
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.87
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
-
-# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2777,43 +2777,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [35] Code Region - G36
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1010
+# CHECK-NEXT: Total Cycles:      959
 # CHECK-NEXT: Total uOps:        4600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.55
-# CHECK-NEXT: IPC:               0.99
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.80
+# CHECK-NEXT: IPC:               1.04
 # CHECK-NEXT: Block RThroughput: 9.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          
-
-# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .  DeE-------R .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.   .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    DeE------R.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . DeE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .   DeE-------R   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .  .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .D=eE-------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER .  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     . D==eE------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeER  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D===eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===eeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D====eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2823,43 +2823,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 9.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.2       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     5.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.4    3.2       <total>
 
 # CHECK:      [36] Code Region - G37
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        4800
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.76
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .  DeE-------R .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeeER  .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     .    DeE-------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeeER.   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE-------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .D=eE-------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeeER  .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     . D===eE-------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeeER .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE-------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2869,43 +2869,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 5.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.3       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     4.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.3       <total>
 
 # CHECK:      [37] Code Region - G38
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1010
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        4800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.75
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.76
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE-------R .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE------R.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeeER .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE-------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE-------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-------R  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER.   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE------R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeeER  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE-------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2915,16 +2915,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    7.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.3       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.3       <total>
 
 # CHECK:      [38] Code Region - G39
 
@@ -2933,7 +2933,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        4500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    1.12
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 10.0
@@ -2943,15 +2943,15 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2961,16 +2961,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [39] Code Region - G40
 
@@ -2979,7 +2979,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        4500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    1.12
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 10.0
@@ -2989,15 +2989,15 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789          0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3007,16 +3007,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [40] Code Region - G41
 
@@ -3025,7 +3025,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      2103
 # CHECK-NEXT: Total uOps:        4600
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.19
 # CHECK-NEXT: IPC:               0.48
 # CHECK-NEXT: Block RThroughput: 10.0
@@ -3035,15 +3035,15 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789          0123
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     .DeE------R    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  D======eE------R .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=====eeeeeeeeER.  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    D=====eE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D======eeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D======eE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=====eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .   D=====eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D========eE------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=======eeeeeeeeER.  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     . D========eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=========eeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==========eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=========eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D==========eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3053,43 +3053,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 5.     1     6.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     7.0    2.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 7.     1     7.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     6.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.4    0.3    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     10.0   2.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     11.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     11.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.9    0.3    3.0       <total>
 
 # CHECK:      [41] Code Region - G42
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        4800
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.76
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE------R   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER.  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     . D===eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D===eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3099,43 +3099,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.0       <total>
 
 # CHECK:      [42] Code Region - G43
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1009
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        4700
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.66
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER.  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeER  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3145,43 +3145,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
-
-# CHECK:      [43] Code Region - G44
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.0       <total>
+
+# CHECK:      [43] Code Region - G44
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      808
+# CHECK-NEXT: Total Cycles:      708
 # CHECK-NEXT: Total uOps:        3900
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.83
-# CHECK-NEXT: IPC:               1.24
-# CHECK-NEXT: Block RThroughput: 7.8
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.51
+# CHECK-NEXT: IPC:               1.41
+# CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
-# CHECK-NEXT: Index     0123456789      
-
-# CHECK:      [0,0]     DeeeeeeeeER    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE------R    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE------R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE------R.   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeER.   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .    .D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . DeeeeeeER   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .    . D=eE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeE-R   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .  D===eE-----R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===eeeeeeER   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .  D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3191,43 +3191,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ldp	s1, s2, [x27], #248
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldp	d1, d2, [x27], #496
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.4    2.6       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    1.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.3    2.8       <total>
 
 # CHECK:      [44] Code Region - G45
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      706
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.97
-# CHECK-NEXT: IPC:               1.42
-# CHECK-NEXT: Block RThroughput: 5.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.52
+# CHECK-NEXT: IPC:               1.97
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeeeeeER . .   ldp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     .DeE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER .   ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  DeeeeeeER.   ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   DeeeeeeER   ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7]     .    DeE----R   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeeeE-R   ldp	w1, w2, [x27], #248
-# CHECK-NEXT: [0,9]     .    .DeE---R   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER ..   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D=eE----R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER..   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D=eE----R..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER.   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeeER   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     . D==eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeE-R   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .  D==eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3237,16 +3237,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
 # CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    1.0       ldp	w1, w2, [x27], #248
-# CHECK-NEXT: 9.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.2    2.0       <total>
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    1.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.2    0.1    2.0       <total>
 
 # CHECK:      [45] Code Region - G46
 
@@ -3255,7 +3255,7 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    1.99
 # CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
@@ -3266,14 +3266,14 @@ add x0, x27, 1
 
 # CHECK:      [0,0]     DeeeeER   .    .   ldp	x1, x2, [x27], #496
 # CHECK-NEXT: [0,1]     D=eE--R   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  .    .   ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .D=eE--R  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER .    .   ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5]     . D=eE--R .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeER    .   ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: [0,7]     .  D=eE---R    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeeeeER   ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: [0,9]     .   D=====eE---R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .    .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eE--R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .    .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE--R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER    .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .D===eE---R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeeeeER   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     . D=======eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3284,15 +3284,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	x1, x2, [x27], #496
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    4.0    0.0       ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: 9.     1     6.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.5    1.2       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    4.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     8.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    0.5    1.2       <total>
 
 # CHECK:      [46] Code Region - G47
 
@@ -3301,25 +3301,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.94
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 3.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER . .   ldr	b1, [x27], #254
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ldr	h1, [x27], #254
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ldr	s1, [x27], #254
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ldr	d1, [x27], #254
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ldr	q1, [x27], #254
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3330,15 +3330,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldr	h1, [x27], #254
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldr	s1, [x27], #254
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldr	d1, [x27], #254
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldr	q1, [x27], #254
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [47] Code Region - G48
 
@@ -3347,25 +3347,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.94
 # CHECK-NEXT: IPC:               1.97
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 3.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeeeER . .   ldr	b1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeeeER. .   ldr	h1, [x27, #254]!
-# CHECK-NEXT: [0,3]     .D=eE----R. .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER .   ldr	s1, [x27, #254]!
-# CHECK-NEXT: [0,5]     . D=eE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeeeER.   ldr	d1, [x27, #254]!
-# CHECK-NEXT: [0,7]     .  D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeeeER   ldr	q1, [x27, #254]!
-# CHECK-NEXT: [0,9]     .   D=eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3376,15 +3376,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldr	h1, [x27, #254]!
-# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldr	s1, [x27, #254]!
-# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldr	d1, [x27, #254]!
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldr	q1, [x27, #254]!
-# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    2.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [48] Code Region - G49
 
@@ -3393,25 +3393,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.96
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   ldr	w1, [x27], #254
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeER  .   ldr	x1, [x27], #254
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeER .   ldr	w1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeeeER.   ldr	x1, [x27, #254]!
-# CHECK-NEXT: [0,7]     . D==eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==eeeeER   ldrb	w1, [x27], #254
-# CHECK-NEXT: [0,9]     .  D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldr	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldr	x1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrb	w1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3423,14 +3423,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	w1, [x27], #254
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	x1, [x27], #254
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldr	w1, [x27, #254]!
-# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ldr	x1, [x27, #254]!
-# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldrb	w1, [x27], #254
-# CHECK-NEXT: 9.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.1    1.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	x1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrb	w1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    1.0       <total>
 
 # CHECK:      [49] Code Region - G50
 
@@ -3439,25 +3439,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.96
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   ldrb	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeER  .   ldrh	w1, [x27], #254
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeER .   ldrh	w1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeeeER.   ldrsb	w1, [x27], #254
-# CHECK-NEXT: [0,7]     . D==eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==eeeeER   ldrsb	x1, [x27], #254
-# CHECK-NEXT: [0,9]     .  D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrh	w1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldrsb	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrsb	x1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3469,14 +3469,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrb	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrh	w1, [x27], #254
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldrh	w1, [x27, #254]!
-# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ldrsb	w1, [x27], #254
-# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldrsb	x1, [x27], #254
-# CHECK-NEXT: 9.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.1    1.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrh	w1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldrsb	w1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrsb	x1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    1.0       <total>
 
 # CHECK:      [50] Code Region - G51
 
@@ -3485,25 +3485,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        1500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    2.96
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   ldrsb	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsb	x1, [x27, #254]!
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeER .   ldrsh	w1, [x27], #254
-# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeeeER.   ldrsh	x1, [x27], #254
-# CHECK-NEXT: [0,7]     . D==eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==eeeeER   ldrsh	w1, [x27, #254]!
-# CHECK-NEXT: [0,9]     .  D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsh	w1, [x27], #254
+# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   ldrsh	x1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeER   ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3515,14 +3515,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsb	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsb	x1, [x27, #254]!
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldrsh	w1, [x27], #254
-# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ldrsh	x1, [x27], #254
-# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldrsh	w1, [x27, #254]!
-# CHECK-NEXT: 9.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.1    1.0       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsh	w1, [x27], #254
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldrsh	x1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldrsh	w1, [x27, #254]!
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    1.0       <total>
 
 # CHECK:      [51] Code Region - G52
 
@@ -3531,10 +3531,10 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1700
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.37
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 3.4
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
@@ -3542,13 +3542,13 @@ add x0, x27, 1
 # CHECK:      [0,0]     DeeeeER .   ldrsh	x1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eE--R .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D=eeeeER.   ldrsw	x1, [x27], #254
-# CHECK-NEXT: [0,3]     .D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeER   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D==eE--R   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeE-R   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7]     . D==eE-R   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D=eeER   st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .  D==eER   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeE-R   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eE-R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeER   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3560,14 +3560,14 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldrsh	x1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldrsw	x1, [x27], #254
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 7.     1     3.0    0.0    1.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.2    0.1    0.8       <total>
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    0.8       <total>
 
 # CHECK:      [52] Code Region - G53
 
@@ -3576,24 +3576,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.97
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.2s }, [x27], #8
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3604,15 +3604,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [53] Code Region - G54
 
@@ -3621,24 +3621,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.97
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.16b }, [x27], #16
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3649,15 +3649,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [54] Code Region - G55
 
@@ -3666,24 +3666,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.97
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.4s }, [x27], x28
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3694,15 +3694,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [55] Code Region - G56
 
@@ -3711,24 +3711,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2400
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.76
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.8
+# CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .   DeER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3738,42 +3738,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.3    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [56] Code Region - G57
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      604
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.30
-# CHECK-NEXT: IPC:               1.66
-# CHECK-NEXT: Block RThroughput: 5.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.16
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.   .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER  .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .  DeER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  DeeER .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     .  D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   DeeER.   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    DeER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeER   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    D=eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeER.   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3783,42 +3783,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.1    0.0       <total>
 
 # CHECK:      [57] Code Region - G58
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      604
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.30
-# CHECK-NEXT: IPC:               1.66
-# CHECK-NEXT: Block RThroughput: 5.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.16
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.   .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     . DeER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER .   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .   DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeER   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .DeER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3829,42 +3829,41 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.2    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [58] Code Region - G59
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3400
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.39
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 6.8
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.84
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeER . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeER .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER.   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3874,43 +3873,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    0.0       <total>
 
 # CHECK:      [59] Code Region - G60
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.59
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 7.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.12
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeER . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeER .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER.   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3920,43 +3918,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    0.0       <total>
 
 # CHECK:      [60] Code Region - G61
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3400
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.39
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 6.8
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.84
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeER . .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeER .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeER .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3966,43 +3963,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.2    0.0       <total>
 
 # CHECK:      [61] Code Region - G62
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.59
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 7.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.11
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeER . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeER .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     . D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4012,43 +4009,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    0.0       <total>
 
 # CHECK:      [62] Code Region - G63
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total Cycles:      804
 # CHECK-NEXT: Total uOps:        4200
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.19
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 8.4
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.22
+# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeER . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeER .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     D=eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER    ..   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER  ..   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     . D==eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER ..   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D==eER ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===eeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4058,43 +4055,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    2.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.4    0.0       <total>
 
 # CHECK:      [63] Code Region - G64
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1003
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.79
-# CHECK-NEXT: IPC:               1.00
-# CHECK-NEXT: Block RThroughput: 7.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.41
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeER . .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeER .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER.   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4104,16 +4100,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.1    0.2    0.0       <total>
 
 # CHECK:      [64] Code Region - G65
 
@@ -4122,25 +4118,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      706
 # CHECK-NEXT: Total uOps:        3200
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.53
 # CHECK-NEXT: IPC:               1.42
-# CHECK-NEXT: Block RThroughput: 6.4
+# CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeER   . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeER .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5]     .   D=eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    DeeeeER.   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7]     .    D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .DeeeeER   st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .D=eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     D=eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeER .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     . D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D===eeeeER.   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     . D====eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===eeeeER   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .  D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4150,16 +4146,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.3    0.3    0.6       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.3    0.6       <total>
 
 # CHECK:      [65] Code Region - G66
 
@@ -4168,25 +4164,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.95
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st1	{ v1.b }[8], [x27], x28
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  .   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER .   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeER.   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeER   st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4197,15 +4193,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [66] Code Region - G67
 
@@ -4214,25 +4210,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2200
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.35
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.4
+# CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st1	{ v1.s }[0], [x27], #4
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeER.   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     .    DeE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4243,15 +4239,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 9.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.4    0.1    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [67] Code Region - G68
 
@@ -4260,25 +4256,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2400
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.74
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.8
+# CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st2	{ v1.2s, v2.2s }, [x27], #16
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  .   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER .   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .  DeE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeER.   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    DeE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4289,42 +4285,42 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 5.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 9.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.3    0.1    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [68] Code Region - G69
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      705
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.69
-# CHECK-NEXT: IPC:               1.42
-# CHECK-NEXT: Block RThroughput: 5.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.14
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789  
-
-# CHECK:      [0,0]     DeeeeER   ..   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeE--R   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeER ..   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE--R ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  DeeeeER..   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .  D=eE--R..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   DeeeeER.   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .   D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .DeE--R   add	x0, x27, #1
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeER.   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4334,43 +4330,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.2    1.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.1    1.0       <total>
 
 # CHECK:      [69] Code Region - G70
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      606
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2400
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.96
-# CHECK-NEXT: IPC:               1.65
-# CHECK-NEXT: Block RThroughput: 4.8
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.74
+# CHECK-NEXT: IPC:               1.98
+# CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789  
-
-# CHECK:      [0,0]     DeeeeER   ..   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=eE--R   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  ..   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     . DeE--R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  DeeeeER..   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .   DeE--R..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   DeeeeER.   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .   D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     .    D=eE--R   add	x0, x27, #1
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4381,15 +4377,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.3    0.2    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [70] Code Region - G71
 
@@ -4398,25 +4394,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.95
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st2	{ v1.b, v2.b }[0], [x27], x28
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeER.   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4427,15 +4423,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [71] Code Region - G72
 
@@ -4444,25 +4440,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.95
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   st2	{ v1.h, v2.h }[4], [x27], x28
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER  .   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER .   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D=eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeeeER.   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D=eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4473,42 +4469,42 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     2.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [72] Code Region - G73
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      707
+# CHECK-NEXT: Total Cycles:      607
 # CHECK-NEXT: Total uOps:        2800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.96
-# CHECK-NEXT: IPC:               1.41
-# CHECK-NEXT: Block RThroughput: 5.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.61
+# CHECK-NEXT: IPC:               1.65
+# CHECK-NEXT: Block RThroughput: 4.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789    
-
-# CHECK:      [0,0]     DeER .    .  .   st2g	x26, [x27], #4064
-# CHECK-NEXT: [0,1]     D=eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.    .  .   st2g	x26, [x27, #4064]!
-# CHECK-NEXT: [0,3]     .D=eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeeeER  .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,5]     .  DeE----R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   DeeeeeER .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,7]     .    DeE---R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .DeeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,9]     .    . DeE---R   add	x0, x27, #1
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeER .    . .   st2g	x26, [x27], #4064
+# CHECK-NEXT: [0,1]     D=eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.    . .   st2g	x26, [x27, #4064]!
+# CHECK-NEXT: [0,3]     D==eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeER .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,7]     . D==eE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D==eeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,9]     .  D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4519,42 +4515,42 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2g	x26, [x27], #4064
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2g	x26, [x27, #4064]!
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 7.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 9.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.3    1.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2g	x26, [x27, #4064]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.2    1.0       <total>
 
 # CHECK:      [73] Code Region - G74
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1007
+# CHECK-NEXT: Total Cycles:      708
 # CHECK-NEXT: Total uOps:        3800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.77
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 7.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.37
+# CHECK-NEXT: IPC:               1.41
+# CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
-# CHECK-NEXT: Index     0123456789       
-
-# CHECK:      [0,0]     DeeeeeeER .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     .DeE----R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeER.    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .  DeE---R.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  ..   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     .    DeE----R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeER..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .    . DeE----R..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE---R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4564,43 +4560,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    1.9       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.9       <total>
 
 # CHECK:      [74] Code Region - G75
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1007
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        3400
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.38
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 6.8
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.81
+# CHECK-NEXT: IPC:               1.41
+# CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
-# CHECK-NEXT: Index     0123456789       
-
-# CHECK:      [0,0]     DeeeeeER  .    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE---R  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeER.    ..   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE---R.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE----R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeER ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE---R ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .  .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeER  .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER  .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4610,43 +4606,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    1.7       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.2    1.7       <total>
 
 # CHECK:      [75] Code Region - G76
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1007
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        4000
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.97
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.28
+# CHECK-NEXT: IPC:               1.32
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
-# CHECK-NEXT: Index     0123456789       
-
-# CHECK:      [0,0]     DeeeeeeER .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE----R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .  DeE----R    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     .    DeE----R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeER..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE----R..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4656,43 +4652,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    2.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [76] Code Region - G77
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1007
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        4000
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.97
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.28
+# CHECK-NEXT: IPC:               1.32
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
-# CHECK-NEXT: Index     0123456789       
-
-# CHECK:      [0,0]     DeeeeeeER .    ..   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     .DeE----R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER    ..   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .  DeE----R    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  ..   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE----R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeER..   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE----R..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .    .   DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4702,43 +4698,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    2.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [77] Code Region - G78
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1007
+# CHECK-NEXT: Total Cycles:      807
 # CHECK-NEXT: Total uOps:        4200
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.17
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 8.4
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.20
+# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
-# CHECK-NEXT: Index     0123456789       
-
-# CHECK:      [0,0]     DeeeeeeER .    ..   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .DeE----R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .  DeE----R    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE----R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeER ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .    . DeE---R ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .   DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeER  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D==eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4748,43 +4744,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 7.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    1.9       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.9       <total>
 
 # CHECK:      [78] Code Region - G79
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1307
+# CHECK-NEXT: Total Cycles:      1205
 # CHECK-NEXT: Total uOps:        5800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.44
-# CHECK-NEXT: IPC:               0.77
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.81
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          
-
-# CHECK:      [0,0]     DeeeeeeER .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     .DeE----R .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeeER   .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     .   DeE----R   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    DeeeeeeER .   .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .DeE----R .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D=eeeeeeeER .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .   DeE-----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    DeeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    . DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     . DeE-----R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeeeeeER   ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .  D=eE----R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D==eeeeeeeER.   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=eeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    . D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4794,43 +4790,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    2.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 9.     1     1.0    1.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.1    0.8    2.1       <total>
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    2.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.4    2.3       <total>
 
 # CHECK:      [79] Code Region - G80
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1107
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        4800
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.34
-# CHECK-NEXT: IPC:               0.90
-# CHECK-NEXT: Block RThroughput: 9.6
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.77
+# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: Block RThroughput: 9.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
-# CHECK-NEXT: Index     0123456789        
-
-# CHECK:      [0,0]     DeeeeeER  .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .DeE---R  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER    . .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE----R    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    DeE----R  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeeER .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .  DeE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   DeeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeER  .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeER.   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .   D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D==eeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4840,43 +4836,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    3.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     1.0    1.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.6    1.9       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.4    2.0       <total>
 
 # CHECK:      [80] Code Region - G81
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1207
+# CHECK-NEXT: Total Cycles:      1058
 # CHECK-NEXT: Total uOps:        5200
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.31
-# CHECK-NEXT: IPC:               0.83
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    4.91
+# CHECK-NEXT: IPC:               0.95
 # CHECK-NEXT: Block RThroughput: 10.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789         
-
-# CHECK:      [0,0]     DeeeeeeeER.    .  .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     . DeE----R.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  DeeeeeeeER  .  .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    DeE----R  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .DeeeeeeER.  .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    . DeE----R.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D=eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    .   D=eE----R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    DeeeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-----R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   . .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE-----R   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeER. .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .   D===eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D===eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    D====eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D===eeeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4886,43 +4882,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     1.0    1.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    2.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.2    0.7    2.0       <total>
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.6    2.2       <total>
 
 # CHECK:      [81] Code Region - G82
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1007
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        4000
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.97
-# CHECK-NEXT: IPC:               0.99
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.28
+# CHECK-NEXT: IPC:               1.32
+# CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
-# CHECK-NEXT: Index     0123456789       
-
-# CHECK:      [0,0]     DeeeeeeER .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .DeE----R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .  DeE----R    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeeeER  ..   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     .    DeE----R  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeeER..   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE----R..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeeeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   DeE----R   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4932,43 +4928,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.0    0.5    2.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [82] Code Region - G83
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      904
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    3.98
-# CHECK-NEXT: IPC:               1.11
-# CHECK-NEXT: Block RThroughput: 7.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.11
+# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
-# CHECK-NEXT: Index     0123456789   
-
-# CHECK:      [0,0]     DeeeeeeER . .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     .DeE----R . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeeeER .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  DeE----R .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeER .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     .    DeE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . DeE--R   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  DeE-R   stg	x26, [x27], #4064
-# CHECK-NEXT: [0,9]     .    .  D=eER   add	x0, x27, #1
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D=eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     . D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eE-R   stg	x26, [x27], #4064
+# CHECK-NEXT: [0,9]     .   D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4978,16 +4974,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 5.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     1.0    0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    1.0    1.0       stg	x26, [x27], #4064
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.1    0.5    1.3       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    1.0       stg	x26, [x27], #4064
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.3       <total>
 
 # CHECK:      [83] Code Region - G84
 
@@ -4996,24 +4992,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2200
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.37
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.4
+# CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeER .  .   stg	x26, [x27, #4064]!
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.  .   stgp	x1, x2, [x27], #992
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeER  .   stgp	x1, x2, [x27, #992]!
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   stp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   stp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   stgp	x1, x2, [x27], #992
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   stgp	x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5024,15 +5020,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stg	x26, [x27, #4064]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       stgp	x1, x2, [x27], #992
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       stgp	x1, x2, [x27, #992]!
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       stp	s1, s2, [x27], #248
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       stp	d1, d2, [x27], #496
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stgp	x1, x2, [x27], #992
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stgp	x1, x2, [x27, #992]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       stp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [84] Code Region - G85
 
@@ -5041,25 +5037,25 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.55
 # CHECK-NEXT: IPC:               1.42
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.    .   stp	q1, q2, [x27], #992
 # CHECK-NEXT: [0,1]     D==eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .   stp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .D==eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=eeER  .   stp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5]     . D==eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=eeER .   stp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7]     .   D==eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==eER.   stp	w1, w2, [x27], #248
-# CHECK-NEXT: [0,9]     .   D===eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D==eeER   .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D===eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D==eeER  .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D===eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D===eeER .   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     . D====eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D====eER.   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     . D=====eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5070,15 +5066,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
 # CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       stp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       stp	w1, w2, [x27], #248
-# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.6    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.8    0.1    0.0       <total>
 
 # CHECK:      [85] Code Region - G86
 
@@ -5087,24 +5083,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2200
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.37
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.4
+# CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeER .  .   stp	x1, x2, [x27], #496
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.  .   stp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeER  .   stp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   str	b1, [x27], #254
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   str	h1, [x27], #254
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5115,15 +5111,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	x1, x2, [x27], #496
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       stp	w1, w2, [x27, #248]!
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       stp	x1, x2, [x27, #496]!
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       str	b1, [x27], #254
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       str	h1, [x27], #254
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       str	b1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [86] Code Region - G87
 
@@ -5132,24 +5128,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2500
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.96
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 3.8
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   str	s1, [x27], #254
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER  .   str	d1, [x27], #254
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER .   str	q1, [x27], #254
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeeER.   str	b1, [x27, #254]!
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER   str	h1, [x27, #254]!
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5160,15 +5156,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       str	d1, [x27], #254
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       str	q1, [x27], #254
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       str	b1, [x27, #254]!
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       str	h1, [x27, #254]!
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [87] Code Region - G88
 
@@ -5177,24 +5173,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2300
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    4.56
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.6
+# CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeeER.  .   str	s1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeER  .   str	d1, [x27, #254]!
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER .   str	q1, [x27, #254]!
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeER .   str	w1, [x27], #254
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeER.   str	x1, [x27], #254
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5205,15 +5201,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       str	d1, [x27, #254]!
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       str	q1, [x27, #254]!
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       str	w1, [x27], #254
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       str	x1, [x27], #254
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [88] Code Region - G89
 
@@ -5222,24 +5218,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.97
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeER .  .   str	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.  .   str	x1, [x27, #254]!
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeER  .   strb	w1, [x27], #254
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeER .   strb	w1, [x27, #254]!
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeER.   strh	w1, [x27], #254
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   str	x1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   strb	w1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   strb	w1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   strh	w1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5250,15 +5246,15 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       str	x1, [x27, #254]!
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       strb	w1, [x27], #254
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       strb	w1, [x27, #254]!
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       strh	w1, [x27], #254
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	x1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       strb	w1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       strb	w1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       strh	w1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [89] Code Region - G90
 
@@ -5267,24 +5263,24 @@ add x0, x27, 1
 # CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
-# CHECK:      Dispatch Width:    5
+# CHECK:      Dispatch Width:    10
 # CHECK-NEXT: uOps Per Cycle:    3.97
 # CHECK-NEXT: IPC:               1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT: Index     012345678
 
 # CHECK:      [0,0]     DeER .  .   strh	w1, [x27, #254]!
 # CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.  .   stz2g	x26, [x27], #4064
-# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeER  .   stz2g	x26, [x27, #4064]!
-# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeER .   stzg	x26, [x27], #4064
-# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeER.   stzg	x26, [x27, #4064]!
-# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.  .   stz2g	x26, [x27], #4064
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eER  .   stz2g	x26, [x27, #4064]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eER .   stzg	x26, [x27], #4064
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   stzg	x26, [x27, #4064]!
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5295,36 +5291,36 @@ add x0, x27, 1
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       strh	w1, [x27, #254]!
 # CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       stz2g	x26, [x27], #4064
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       stz2g	x26, [x27, #4064]!
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       stzg	x26, [x27], #4064
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       stzg	x26, [x27, #4064]!
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stz2g	x26, [x27], #4064
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       stz2g	x26, [x27, #4064]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       stzg	x26, [x27], #4064
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       stzg	x26, [x27, #4064]!
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [90] Code Region - G91
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      143
+# CHECK-NEXT: Total Cycles:      110
 # CHECK-NEXT: Total uOps:        600
 
-# CHECK:      Dispatch Width:    5
-# CHECK-NEXT: uOps Per Cycle:    4.20
-# CHECK-NEXT: IPC:               2.80
-# CHECK-NEXT: Block RThroughput: 1.2
+# CHECK:      Dispatch Width:    10
+# CHECK-NEXT: uOps Per Cycle:    5.45
+# CHECK-NEXT: IPC:               3.64
+# CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789 
+# CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeeeER   .   ldr	x1, [x27], #254
 # CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     D====eeeeER   ldr	x2, [x1], #254
-# CHECK-NEXT: [0,3]     .DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     D=eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5336,5 +5332,5 @@ add x0, x27, 1
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	x1, [x27], #254
 # CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	x2, [x1], #254
-# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.3    2.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-neon-instructions.s
index 68a067eb8a360..d417101fc8d0a 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-neon-instructions.s
@@ -1257,11 +1257,11 @@ zip2	v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        addhn2	v0.8h, v0.4s, v0.4s
 # CHECK-NEXT:  1      2     0.25                        addp	v0.2d, v0.2d, v0.2d
 # CHECK-NEXT:  1      2     0.25                        addp	v0.8b, v0.8b, v0.8b
-# CHECK-NEXT:  1      2     0.50                        addv	s0, v0.4s
-# CHECK-NEXT:  1      2     0.50                        addv	h0, v0.4h
-# CHECK-NEXT:  2      4     0.50                        addv	h0, v0.8h
-# CHECK-NEXT:  2      4     0.50                        addv	b0, v0.8b
-# CHECK-NEXT:  2      4     1.00                        addv	b0, v0.16b
+# CHECK-NEXT:  1      3     0.50                        addv	s0, v0.4s
+# CHECK-NEXT:  1      3     0.50                        addv	h0, v0.4h
+# CHECK-NEXT:  2      5     0.50                        addv	h0, v0.8h
+# CHECK-NEXT:  2      5     0.50                        addv	b0, v0.8b
+# CHECK-NEXT:  2      6     1.00                        addv	b0, v0.16b
 # CHECK-NEXT:  1      2     0.25                        aesd	v0.16b, v0.16b
 # CHECK-NEXT:  1      2     0.25                        aese	v0.16b, v0.16b
 # CHECK-NEXT:  1      2     0.25                        aesimc	v0.16b, v0.16b
@@ -1779,11 +1779,11 @@ zip2	v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        saddlp	v0.4h, v0.8b
 # CHECK-NEXT:  1      2     0.25                        saddlp	v0.4s, v0.8h
 # CHECK-NEXT:  1      2     0.25                        saddlp	v0.8h, v0.16b
-# CHECK-NEXT:  1      2     0.50                        saddlv	d0, v0.4s
-# CHECK-NEXT:  1      2     0.50                        saddlv	s0, v0.4h
-# CHECK-NEXT:  2      4     0.50                        saddlv	s0, v0.8h
-# CHECK-NEXT:  2      4     0.50                        saddlv	h0, v0.8b
-# CHECK-NEXT:  2      4     1.00                        saddlv	h0, v0.16b
+# CHECK-NEXT:  1      3     0.50                        saddlv	d0, v0.4s
+# CHECK-NEXT:  1      3     0.50                        saddlv	s0, v0.4h
+# CHECK-NEXT:  2      5     0.50                        saddlv	s0, v0.8h
+# CHECK-NEXT:  2      5     0.50                        saddlv	h0, v0.8b
+# CHECK-NEXT:  2      6     1.00                        saddlv	h0, v0.16b
 # CHECK-NEXT:  1      2     0.25                        saddw	v0.2d, v0.2d, v0.2s
 # CHECK-NEXT:  1      2     0.25                        saddw	v0.4s, v0.4s, v0.4h
 # CHECK-NEXT:  1      2     0.25                        saddw	v0.8h, v0.8h, v0.8b
@@ -2221,11 +2221,11 @@ zip2	v0.8h, v0.8h, v0.8h
 # CHECK-NEXT:  1      2     0.25                        uaddlp	v0.4h, v0.8b
 # CHECK-NEXT:  1      2     0.25                        uaddlp	v0.4s, v0.8h
 # CHECK-NEXT:  1      2     0.25                        uaddlp	v0.8h, v0.16b
-# CHECK-NEXT:  1      2     0.50                        uaddlv	d0, v0.4s
-# CHECK-NEXT:  1      2     0.50                        uaddlv	s0, v0.4h
-# CHECK-NEXT:  2      4     0.50                        uaddlv	s0, v0.8h
-# CHECK-NEXT:  2      4     0.50                        uaddlv	h0, v0.8b
-# CHECK-NEXT:  2      4     1.00                        uaddlv	h0, v0.16b
+# CHECK-NEXT:  1      3     0.50                        uaddlv	d0, v0.4s
+# CHECK-NEXT:  1      3     0.50                        uaddlv	s0, v0.4h
+# CHECK-NEXT:  2      5     0.50                        uaddlv	s0, v0.8h
+# CHECK-NEXT:  2      5     0.50                        uaddlv	h0, v0.8b
+# CHECK-NEXT:  2      6     1.00                        uaddlv	h0, v0.16b
 # CHECK-NEXT:  1      2     0.25                        uaddw	v0.2d, v0.2d, v0.2s
 # CHECK-NEXT:  1      2     0.25                        uaddw	v0.4s, v0.4s, v0.4h
 # CHECK-NEXT:  1      2     0.25                        uaddw	v0.8h, v0.8h, v0.8b



More information about the llvm-commits mailing list