[llvm] [AArch64] Fix postinc operands for Cortex-A510 scheduling (PR #68518)

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 8 03:33:58 PDT 2023


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/68518

Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc loads/stores in the Cortex-A510 scheduling model.

I will add the same for other models too; this one will be the most impactful because it is the default CPU scheduling model.

From 10f934de6abe8e968e966008c16aa2333b280e57 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Sun, 8 Oct 2023 11:25:40 +0100
Subject: [PATCH] [AArch64] Fix postinc operands for Cortex-A510 scheduling

Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc
loads/stores in the Cortex-A510 scheduling model.

I will add the same for other models too; this one will be the most impactful
because it is the default CPU scheduling model.
---
 llvm/lib/Target/AArch64/AArch64SchedA510.td   |   68 +-
 .../aarch64-interleaved-access-w-undef.ll     |    2 +-
 .../AArch64/arm64-indexed-vector-ldst.ll      |   18 +-
 llvm/test/CodeGen/AArch64/extbinopload.ll     |  180 +-
 llvm/test/CodeGen/AArch64/ld1postmul.ll       |    2 +-
 .../AArch64/machine-cse-profitable-check.ll   |    4 +-
 llvm/test/CodeGen/AArch64/tbl-loops.ll        |   14 +-
 llvm/test/CodeGen/AArch64/vldn_shuffle.ll     |    4 +-
 .../llvm-mca/AArch64/Cortex/A510-writeback.s  | 2042 ++++++++---------
 9 files changed, 1167 insertions(+), 1167 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td
index fab2cda87807554..1afbc5d9102ca96 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA510.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td
@@ -295,16 +295,16 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
 
 //    2-element structures
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
@@ -312,10 +312,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
 def : InstRW<[CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[CortexA510WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
 
 //    3-element structures
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
@@ -323,10 +323,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
 def : InstRW<[CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
 def : InstRW<[CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
 
 //    4-element structures
 def : InstRW<[CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
@@ -334,10 +334,10 @@ def : InstRW<[CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$
 def : InstRW<[CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
 def : InstRW<[CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA510WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 
 //---
 // Vector Stores
@@ -347,28 +347,28 @@ def : InstRW<[CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d
 def : InstRW<[CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
 def : InstRW<[CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
 
 def : InstRW<[CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
 def : InstRW<[CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA510WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA510WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA510WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 //---
 // Floating Point Conversions, MAC, DIV, SQRT
diff --git a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
index cbda7b027587d9c..07fbe5d7310f60f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-interleaved-access-w-undef.ll
@@ -47,10 +47,10 @@ define void @f_undef_1(<8 x i64> %a, ptr %dst) {
 ; CHECK-LABEL: f_undef_1:
 ; CHECK:       // %bb.0: // %BB
 ; CHECK-NEXT:    mov v16.16b, v0.16b
-; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov v5.16b, v2.16b
 ; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q1_q2
 ; CHECK-NEXT:    // kill: def $q3 killed $q3 def $q3_q4
+; CHECK-NEXT:    mov x8, x0
 ; CHECK-NEXT:    mov v2.16b, v1.16b
 ; CHECK-NEXT:    mov v4.16b, v3.16b
 ; CHECK-NEXT:    mov v17.16b, v16.16b
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 6657b19d24929d8..7d73e1c6c1d7f41 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -14320,8 +14320,8 @@ define <8 x i8> @test_v8i8_post_imm_ld1lane(ptr %bar, ptr %ptr, <8 x i8> %A) {
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.b { v0 }[1], [x0], #1
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1lane:
@@ -14345,8 +14345,8 @@ define <8 x i8> @test_v8i8_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <8 x i
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.b { v0 }[1], [x0], x2
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v8i8_post_reg_ld1lane:
@@ -14413,8 +14413,8 @@ define <4 x i16> @test_v4i16_post_imm_ld1lane(ptr %bar, ptr %ptr, <4 x i16> %A)
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.h { v0 }[1], [x0], #2
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_imm_ld1lane:
@@ -14439,8 +14439,8 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <4 x
 ; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.h { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v4i16_post_reg_ld1lane:
@@ -14507,8 +14507,8 @@ define <2 x i32> @test_v2i32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x i32> %A)
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_imm_ld1lane:
@@ -14533,8 +14533,8 @@ define <2 x i32> @test_v2i32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2 x
 ; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2i32_post_reg_ld1lane:
@@ -14644,8 +14644,8 @@ define <2 x float> @test_v2f32_post_imm_ld1lane(ptr %bar, ptr %ptr, <2 x float>
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], #4
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_imm_ld1lane:
@@ -14670,8 +14670,8 @@ define <2 x float> @test_v2f32_post_reg_ld1lane(ptr %bar, ptr %ptr, i64 %inc, <2
 ; CHECK-NEXT:    lsl x8, x2, #2
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.s { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-GISEL-LABEL: test_v2f32_post_reg_ld1lane:
@@ -14776,9 +14776,9 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(ptr %bar, ptr %ptr,
 ; CHECK-NEXT:    lsl x8, x2, #1
 ; CHECK-NEXT:    ; kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    ld1.h { v0 }[1], [x0], x8
-; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    str x0, [x1]
 ; CHECK-NEXT:    ldr d1, [x3]
+; CHECK-NEXT:    ; kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    cnt.8b v1, v1
 ; CHECK-NEXT:    uaddlp.4h v1, v1
 ; CHECK-NEXT:    uaddlp.2s v1, v1
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 99f573795489a08..849fc7aa00a8e7e 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -365,15 +365,15 @@ define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ldp s3, s2, [x2]
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
 ; CHECK-NEXT:    ushll2 v4.4s, v1.8h, #3
 ; CHECK-NEXT:    ushll v1.4s, v1.4h, #3
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #3
 ; CHECK-NEXT:    uaddw v2.4s, v2.4s, v3.4h
 ; CHECK-NEXT:    uaddw2 v3.4s, v4.4s, v0.8h
 ; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
@@ -407,10 +407,10 @@ define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    uaddl v1.8h, v2.8b, v3.8b
 ; CHECK-NEXT:    ret
@@ -444,10 +444,10 @@ define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    usubl v0.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
 ; CHECK-NEXT:    shll v3.4s, v2.4h, #16
@@ -489,18 +489,18 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t,
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
-; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
-; CHECK-NEXT:    ldp s4, s5, [x4]
+; CHECK-NEXT:    ldp s2, s3, [x2]
 ; CHECK-NEXT:    usubl v1.8h, v0.8b, v1.8b
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x5], #4
-; CHECK-NEXT:    ldp s6, s7, [x6]
+; CHECK-NEXT:    ldp s4, s5, [x4]
 ; CHECK-NEXT:    usubl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ld1 { v4.s }[1], [x5], #4
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x5]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x7], #4
+; CHECK-NEXT:    ldp s6, s7, [x6]
 ; CHECK-NEXT:    usubl v4.8h, v4.8b, v5.8b
+; CHECK-NEXT:    ld1 { v6.s }[1], [x7], #4
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x7]
 ; CHECK-NEXT:    usubl v5.8h, v6.8b, v7.8b
 ; CHECK-NEXT:    shll v0.4s, v4.4h, #16
@@ -647,7 +647,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldr s1, [x0]
 ; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    add x11, x3, #12
 ; CHECK-NEXT:    str s1, [x4]
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    ldp s0, s5, [x2]
@@ -664,16 +664,16 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-NEXT:    add x9, x1, #4
 ; CHECK-NEXT:    uzp1 v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    mov v0.b[11], w10
-; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    add x10, x1, #12
 ; CHECK-NEXT:    ld1 { v0.s }[3], [x3], #4
 ; CHECK-NEXT:    ldr s4, [x0, #12]
 ; CHECK-NEXT:    ldp s3, s16, [x0, #4]
-; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
+; CHECK-NEXT:    ldp s6, s7, [x2, #8]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
 ; CHECK-NEXT:    add x8, x1, #8
 ; CHECK-NEXT:    ld1 { v16.s }[1], [x8]
 ; CHECK-NEXT:    uaddl v2.8h, v3.8b, v4.8b
@@ -757,39 +757,39 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shuffle:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldp s2, s7, [x0, #8]
-; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldr s18, [x1, #12]
-; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    ldp s3, s16, [x0]
-; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    mov v4.16b, v7.16b
-; CHECK-NEXT:    ldp s6, s17, [x2, #8]
+; CHECK-NEXT:    ldp s0, s1, [x0, #8]
+; CHECK-NEXT:    add x8, x1, #8
+; CHECK-NEXT:    ldr s6, [x1, #12]
+; CHECK-NEXT:    ldp s17, s18, [x2, #8]
+; CHECK-NEXT:    ldp s2, s3, [x2]
+; CHECK-NEXT:    add x9, x3, #8
+; CHECK-NEXT:    mov v4.16b, v1.16b
+; CHECK-NEXT:    ldp s7, s16, [x0]
 ; CHECK-NEXT:    ldr s5, [x3, #12]
-; CHECK-NEXT:    mov v7.s[1], v18.s[0]
-; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
-; CHECK-NEXT:    mov v4.s[1], v18.s[0]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
+; CHECK-NEXT:    mov v1.s[1], v6.s[0]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
+; CHECK-NEXT:    mov v4.s[1], v6.s[0]
+; CHECK-NEXT:    ld1 { v7.s }[1], [x1], #4
 ; CHECK-NEXT:    ld1 { v16.s }[1], [x1]
-; CHECK-NEXT:    mov v4.s[2], v17.s[0]
-; CHECK-NEXT:    mov v17.s[1], v5.s[0]
-; CHECK-NEXT:    uaddl v2.8h, v3.8b, v2.8b
-; CHECK-NEXT:    uaddl v6.8h, v0.8b, v6.8b
-; CHECK-NEXT:    uaddl v7.8h, v16.8b, v7.8b
-; CHECK-NEXT:    uaddl v1.8h, v1.8b, v17.8b
+; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
+; CHECK-NEXT:    ld1 { v17.s }[1], [x9]
+; CHECK-NEXT:    mov v4.s[2], v18.s[0]
+; CHECK-NEXT:    mov v18.s[1], v5.s[0]
+; CHECK-NEXT:    uaddl v1.8h, v16.8b, v1.8b
+; CHECK-NEXT:    uaddl v6.8h, v7.8b, v0.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v17.8b
+; CHECK-NEXT:    uaddl v3.8h, v3.8b, v18.8b
+; CHECK-NEXT:    ushll v0.4s, v1.4h, #3
+; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #3
 ; CHECK-NEXT:    mov v4.s[3], v5.s[0]
-; CHECK-NEXT:    ushll v0.4s, v7.4h, #3
-; CHECK-NEXT:    ushll v16.4s, v1.4h, #3
-; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #3
-; CHECK-NEXT:    ushll2 v1.4s, v7.8h, #3
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v6.4h
+; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v6.8h
+; CHECK-NEXT:    ushll v7.4s, v3.4h, #3
+; CHECK-NEXT:    ushll2 v3.4s, v3.8h, #3
 ; CHECK-NEXT:    str q4, [x4]
-; CHECK-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v6.8h
-; CHECK-NEXT:    uaddw v2.4s, v16.4s, v6.4h
+; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
+; CHECK-NEXT:    uaddw v2.4s, v7.4s, v2.4h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4
@@ -859,32 +859,32 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_ext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s1, s2, [x2]
-; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s3, s5, [x0]
-; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    ldp s3, s5, [x0]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ldp s6, s0, [x2, #8]
+; CHECK-NEXT:    add x8, x3, #8
 ; CHECK-NEXT:    ldp s7, s4, [x0, #8]
-; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    ld1 { v1.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v3.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v0.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x1]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v0.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v2.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x1]
+; CHECK-NEXT:    uaddl v5.8h, v5.8b, v4.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v0.8b
 ; CHECK-NEXT:    ushll v16.8h, v0.8b, #0
 ; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
 ; CHECK-NEXT:    uaddl v6.8h, v1.8b, v6.8b
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v0.8b
-; CHECK-NEXT:    uaddl v5.8h, v5.8b, v4.8b
 ; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-NEXT:    ushll v1.4s, v5.4h, #3
 ; CHECK-NEXT:    ushll v7.4s, v2.4h, #3
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #3
-; CHECK-NEXT:    stp q4, q16, [x4]
-; CHECK-NEXT:    ushll v1.4s, v5.4h, #3
 ; CHECK-NEXT:    ushll2 v5.4s, v5.8h, #3
+; CHECK-NEXT:    stp q4, q16, [x4]
 ; CHECK-NEXT:    uaddw v0.4s, v1.4s, v3.4h
 ; CHECK-NEXT:    uaddw2 v1.4s, v5.4s, v3.8h
 ; CHECK-NEXT:    uaddw2 v3.4s, v2.4s, v6.8h
@@ -958,33 +958,33 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_add:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s2, s3, [x2]
-; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    ldp s2, s3, [x2]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ldp s4, s5, [x0, #8]
+; CHECK-NEXT:    add x8, x3, #8
 ; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
 ; CHECK-NEXT:    uaddl v5.8h, v1.8b, v5.8b
+; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
 ; CHECK-NEXT:    uaddl v1.8h, v0.8b, v4.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
+; CHECK-NEXT:    ushll v0.4s, v5.4h, #3
 ; CHECK-NEXT:    ushll v4.4s, v7.4h, #3
 ; CHECK-NEXT:    ushll2 v3.4s, v7.8h, #3
-; CHECK-NEXT:    ushll v0.4s, v5.4h, #3
 ; CHECK-NEXT:    ushll2 v6.4s, v5.8h, #3
 ; CHECK-NEXT:    stp q5, q7, [x4]
+; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NEXT:    uaddw2 v3.4s, v3.4s, v2.8h
 ; CHECK-NEXT:    uaddw v2.4s, v4.4s, v2.4h
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NEXT:    uaddw2 v1.4s, v6.4s, v1.8h
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
@@ -1055,25 +1055,25 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_ext2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x2]
-; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s2, s3, [x0]
-; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    ldp s2, s3, [x0]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ldp s4, s5, [x2, #8]
+; CHECK-NEXT:    add x8, x3, #8
 ; CHECK-NEXT:    ldp s6, s7, [x0, #8]
-; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x1], #4
-; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x1]
-; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
 ; CHECK-NEXT:    uaddl v7.8h, v3.8b, v7.8b
 ; CHECK-NEXT:    uaddl v3.8h, v1.8b, v5.8b
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
+; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
 ; CHECK-NEXT:    ushll v0.4s, v7.4h, #3
 ; CHECK-NEXT:    ushll2 v1.4s, v7.8h, #3
 ; CHECK-NEXT:    ushll v5.4s, v3.4h, #3
@@ -1157,35 +1157,35 @@ define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-LABEL: extrause_shl:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ldp s0, s1, [x0]
-; CHECK-NEXT:    add x8, x3, #8
-; CHECK-NEXT:    ldp s2, s3, [x2]
-; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    add x10, x3, #12
+; CHECK-NEXT:    ldp s2, s3, [x2]
+; CHECK-NEXT:    add x11, x1, #12
 ; CHECK-NEXT:    ldp s4, s5, [x0, #8]
+; CHECK-NEXT:    add x8, x3, #8
 ; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    add x11, x1, #12
+; CHECK-NEXT:    add x9, x1, #8
 ; CHECK-NEXT:    ld1 { v2.s }[1], [x3], #4
 ; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
+; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
 ; CHECK-NEXT:    ld1 { v5.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
 ; CHECK-NEXT:    ld1 { v7.s }[1], [x10]
 ; CHECK-NEXT:    ld1 { v4.s }[1], [x9]
 ; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x3]
-; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
+; CHECK-NEXT:    uaddl v1.8h, v1.8b, v5.8b
+; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
 ; CHECK-NEXT:    uaddl v4.8h, v0.8b, v4.8b
 ; CHECK-NEXT:    uaddl v2.8h, v2.8b, v6.8b
-; CHECK-NEXT:    uaddl v3.8h, v3.8b, v7.8b
-; CHECK-NEXT:    uaddl v1.8h, v1.8b, v5.8b
-; CHECK-NEXT:    ushll v6.4s, v3.4h, #3
-; CHECK-NEXT:    ushll2 v16.4s, v3.8h, #3
 ; CHECK-NEXT:    ushll v5.4s, v1.4h, #3
+; CHECK-NEXT:    ushll v6.4s, v3.4h, #3
 ; CHECK-NEXT:    ushll2 v7.4s, v1.8h, #3
-; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v2.8h
-; CHECK-NEXT:    uaddw v2.4s, v6.4s, v2.4h
-; CHECK-NEXT:    stp q6, q16, [x4, #32]
+; CHECK-NEXT:    ushll2 v16.4s, v3.8h, #3
 ; CHECK-NEXT:    uaddw v0.4s, v5.4s, v4.4h
 ; CHECK-NEXT:    uaddw2 v1.4s, v7.4s, v4.8h
 ; CHECK-NEXT:    stp q5, q7, [x4]
+; CHECK-NEXT:    uaddw2 v3.4s, v16.4s, v2.8h
+; CHECK-NEXT:    uaddw v2.4s, v6.4s, v2.4h
+; CHECK-NEXT:    stp q6, q16, [x4, #32]
 ; CHECK-NEXT:    ret
   %lp1 = load <4 x i8>, ptr %p
   %p2 = getelementptr i8, ptr %p, i32 4
diff --git a/llvm/test/CodeGen/AArch64/ld1postmul.ll b/llvm/test/CodeGen/AArch64/ld1postmul.ll
index 1553aab9046edb7..658010d09e5ba2b 100644
--- a/llvm/test/CodeGen/AArch64/ld1postmul.ll
+++ b/llvm/test/CodeGen/AArch64/ld1postmul.ll
@@ -81,8 +81,8 @@ define ptr @fmla_v4f16(ptr %p, ptr %ps, <4 x half> %t, <4 x half> %u) {
 ; CHECK-NOFP16:       // %bb.0:
 ; CHECK-NOFP16-NEXT:    ld1r { v2.4h }, [x0], #2
 ; CHECK-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
-; CHECK-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
+; CHECK-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-NOFP16-NEXT:    fmul v0.4s, v2.4s, v0.4s
 ; CHECK-NOFP16-NEXT:    fcvtn v0.4h, v0.4s
 ; CHECK-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
diff --git a/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll b/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
index c89d99328ceb323..d12240a9f4f3201 100644
--- a/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
+++ b/llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll
@@ -17,9 +17,9 @@ define void @foo(ptr %buf, <8 x i16> %a) {
 ; CHECK-AGGRESSIVE-CSE:       // %bb.0: // %entry
 ; CHECK-AGGRESSIVE-CSE-NEXT:    // kill: def $q0 killed $q0 def $q0_q1
 ; CHECK-AGGRESSIVE-CSE-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-AGGRESSIVE-CSE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
 ; CHECK-AGGRESSIVE-CSE-NEXT:    st2 { v0.4h, v1.4h }, [x0], #16
-; CHECK-AGGRESSIVE-CSE-NEXT:    zip2 v0.8h, v0.8h, v1.8h
-; CHECK-AGGRESSIVE-CSE-NEXT:    str q0, [x0]
+; CHECK-AGGRESSIVE-CSE-NEXT:    str q2, [x0]
 ; CHECK-AGGRESSIVE-CSE-NEXT:    ret
 entry:
   %vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index be3df664c8876a1..b63d540fb8e0291 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -199,13 +199,13 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:  .LBB1_9: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld2 { v1.4s, v2.4s }, [x1], #32
+; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    fcmgt v3.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    fcmgt v4.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    fcmlt v5.4s, v1.4s, #0.0
-; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    bsl v3.16b, v0.16b, v1.16b
-; CHECK-NEXT:    fcmlt v1.4s, v2.4s, #0.0
 ; CHECK-NEXT:    bsl v4.16b, v0.16b, v2.16b
+; CHECK-NEXT:    fcmlt v1.4s, v2.4s, #0.0
 ; CHECK-NEXT:    bic v2.16b, v3.16b, v5.16b
 ; CHECK-NEXT:    bic v1.16b, v4.16b, v1.16b
 ; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
@@ -346,17 +346,17 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:  .LBB2_4: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld3 { v2.4s, v3.4s, v4.4s }, [x1], #48
+; CHECK-NEXT:    add x13, x0, #8
+; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    fcmgt v5.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    fcmgt v6.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    fcmgt v7.4s, v4.4s, v0.4s
 ; CHECK-NEXT:    fcmlt v16.4s, v2.4s, #0.0
 ; CHECK-NEXT:    fcmlt v17.4s, v3.4s, #0.0
-; CHECK-NEXT:    add x13, x0, #8
-; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    bsl v5.16b, v0.16b, v2.16b
-; CHECK-NEXT:    fcmlt v2.4s, v4.4s, #0.0
 ; CHECK-NEXT:    bsl v6.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    bsl v7.16b, v0.16b, v4.16b
+; CHECK-NEXT:    fcmlt v2.4s, v4.4s, #0.0
 ; CHECK-NEXT:    bic v3.16b, v5.16b, v16.16b
 ; CHECK-NEXT:    bic v4.16b, v6.16b, v17.16b
 ; CHECK-NEXT:    bic v2.16b, v7.16b, v2.16b
@@ -599,19 +599,19 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:  .LBB3_9: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
+; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    fcmgt v6.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    fcmgt v7.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    fcmgt v16.4s, v4.4s, v0.4s
 ; CHECK-NEXT:    fcmgt v17.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    fcmlt v18.4s, v2.4s, #0.0
 ; CHECK-NEXT:    fcmlt v19.4s, v3.4s, #0.0
-; CHECK-NEXT:    subs x12, x12, #4
 ; CHECK-NEXT:    fcmlt v20.4s, v4.4s, #0.0
 ; CHECK-NEXT:    bsl v6.16b, v0.16b, v2.16b
-; CHECK-NEXT:    fcmlt v2.4s, v5.4s, #0.0
 ; CHECK-NEXT:    bsl v7.16b, v0.16b, v3.16b
 ; CHECK-NEXT:    bsl v16.16b, v0.16b, v4.16b
 ; CHECK-NEXT:    bsl v17.16b, v0.16b, v5.16b
+; CHECK-NEXT:    fcmlt v2.4s, v5.4s, #0.0
 ; CHECK-NEXT:    bic v3.16b, v6.16b, v18.16b
 ; CHECK-NEXT:    bic v4.16b, v7.16b, v19.16b
 ; CHECK-NEXT:    bic v5.16b, v16.16b, v20.16b
diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
index 55d3943bbc7d890..5c73ba16972beb7 100644
--- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll
@@ -88,10 +88,10 @@ define void @vld4(ptr nocapture readonly %pSrc, ptr noalias nocapture %pDst, i32
 ; CHECK-NEXT:  .LBB2_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
-; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    add x9, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32
 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
+; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
@@ -257,10 +257,10 @@ define void @vld4_multiuse(ptr nocapture readonly %pSrc, ptr noalias nocapture %
 ; CHECK-NEXT:  .LBB6_1: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0], #64
-; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    add x9, x1, x8
 ; CHECK-NEXT:    add x8, x8, #32
 ; CHECK-NEXT:    cmp x8, #2, lsl #12 // =8192
+; CHECK-NEXT:    fmul v4.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    fmla v4.4s, v1.4s, v1.4s
 ; CHECK-NEXT:    fmul v5.4s, v2.4s, v2.4s
 ; CHECK-NEXT:    fmla v5.4s, v3.4s, v3.4s
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s
index f9b4509531d35cb..94439acafe370fe 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-writeback.s
@@ -1162,28 +1162,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total Cycles:      1701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    0.88
+# CHECK-NEXT: IPC:               0.59
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    . .   ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1]     .  DE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .  DE.    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .   DeeE  .    . .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    . DE .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .  DeeE   . .   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .DE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    . DeeeE.   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    . .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     . DE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeeE  .    . .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .DE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . DeeE    . .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    DeeE . .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    . DE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .  DeeeE   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1208,28 +1208,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2301
+# CHECK-NEXT: Total Cycles:      1801
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.65
-# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.56
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    .  .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .  DE.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .   DeeeE .    .  .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .  DE.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .   DeeE  .  .   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    . DE .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE.   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    . DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     . DE .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeeE  .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .DE  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . DeeeE   .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    DE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .DeeE.  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .   DeeeE   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1254,28 +1254,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total Cycles:      1701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    0.88
+# CHECK-NEXT: IPC:               0.59
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    . .   ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .  DE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    . .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    . DE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    . DE .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .  DeeE   . .   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    . DeeeE.   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    . .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    . .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DE   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeeE    . .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    DeeE . .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    . DE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .  DeeeE   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1300,28 +1300,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1346,28 +1346,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1392,28 +1392,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1438,28 +1438,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1484,28 +1484,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1530,28 +1530,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1576,28 +1576,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1622,28 +1622,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1668,28 +1668,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1714,28 +1714,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2401
+# CHECK-NEXT: Total Cycles:      1901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.62
-# CHECK-NEXT: IPC:               0.42
+# CHECK-NEXT: uOps Per Cycle:    0.79
+# CHECK-NEXT: IPC:               0.53
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeE.    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeE.   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .    .    .    .    .  DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeE   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .    .    .  DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1760,28 +1760,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.75
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.67
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    .   ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1806,28 +1806,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.75
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.67
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    .   ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1852,28 +1852,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.75
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.67
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    .   ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1898,28 +1898,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.75
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.67
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    .   ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1944,28 +1944,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2001
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.75
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    1.00
+# CHECK-NEXT: IPC:               0.67
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeE .    .    .    .   ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeE    .   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeE.   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DE   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeE.    .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   DeeE  .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeE   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1990,28 +1990,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeE .    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .  DE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    .    .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    . DE .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeeeE.    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .    .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DE .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeE   .    .    .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeeeeE  .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2036,28 +2036,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3301
+# CHECK-NEXT: Total Cycles:      2801
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.45
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    0.54
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    .DE  .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .  DE.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeE    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DE   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeeE.   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeE    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeE   .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .DeeeeeE  .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .DE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeE   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2082,28 +2082,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2901
+# CHECK-NEXT: Total Cycles:      2401
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.52
-# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.42
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          01234
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeeE   .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .DE  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .DE  .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeE.   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .  DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeeeE.    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .  DE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DE .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeeeE   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .  DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2128,28 +2128,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2701
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.56
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .    .    .DE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .    .DE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   . .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .DE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE.   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012
+
+# CHECK:      [0,0]     DeeeeeE   .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeE    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .   DE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .  DE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2174,28 +2174,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2220,28 +2220,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2266,28 +2266,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2312,28 +2312,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2358,28 +2358,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2601
+# CHECK-NEXT: Total Cycles:      2101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.58
-# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.48
 # CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    ..   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    ..   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    ..   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    ..   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeeE   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2404,28 +2404,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DE.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2450,28 +2450,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DE.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2496,28 +2496,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2801
+# CHECK-NEXT: Total Cycles:      2301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.54
-# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DE .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeE  .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .    .    .    .    . DE .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .  DeeeE.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .    .    .    .    .    . DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .    .    .  DE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .   DeeeE   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .    .    . DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2542,28 +2542,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2588,28 +2588,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2634,28 +2634,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2680,28 +2680,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2726,28 +2726,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2772,28 +2772,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3301
+# CHECK-NEXT: Total Cycles:      2801
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.45
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    0.54
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeeE   .    .    .    .  .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .  DE.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeE    .  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DE   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeeE.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeeeE    .    .    .  .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeE   .    .  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DE   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .DeeeeeE  .  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .DE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeE   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2818,28 +2818,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      3001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .    .DE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .  DE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    .    DE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    . DE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2864,28 +2864,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      3001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    DE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    . DE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2910,28 +2910,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2956,28 +2956,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3002,28 +3002,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3048,28 +3048,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3094,28 +3094,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3140,28 +3140,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2101
+# CHECK-NEXT: Total Cycles:      1801
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.81
-# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: uOps Per Cycle:    0.94
+# CHECK-NEXT: IPC:               0.56
 # CHECK-NEXT: Block RThroughput: 5.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    ..   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DE    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    ..   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DE    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeE ..   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .    .    .    . DE ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .  DeeE   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeE  .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .   DE  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    DeeE   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .    . DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3526,9 +3526,9 @@ add x0, x27, 1
 # CHECK-NEXT: [0,3]     .  DE.    .   .   add	x0, x27, #1
 # CHECK-NEXT: [0,4]     .   DeE   .   .   ldrsw	x1, [x27, #254]!
 # CHECK-NEXT: [0,5]     .    DE   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    DeeeE.   .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,6]     .    .DeeeE   .   st1	{ v1.1d }, [x27], #8
 # CHECK-NEXT: [0,7]     .    .   DE   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   DeeeE.   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,8]     .    .    DeeeE   st1	{ v1.2d }, [x27], #16
 # CHECK-NEXT: [0,9]     .    .    .  DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
@@ -3554,7 +3554,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3564,18 +3564,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3600,7 +3600,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3610,18 +3610,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3646,7 +3646,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3656,18 +3656,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3692,7 +3692,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3702,18 +3702,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3738,7 +3738,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3748,18 +3748,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3784,7 +3784,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3794,18 +3794,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3830,7 +3830,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3840,18 +3840,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3876,7 +3876,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3886,18 +3886,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3922,7 +3922,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3932,18 +3932,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3968,7 +3968,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -3978,18 +3978,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4014,7 +4014,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4024,18 +4024,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4060,7 +4060,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4070,18 +4070,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4106,7 +4106,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2202
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4116,18 +4116,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeeE    .    .    .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .  .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5]     .    .    .   DE    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .   DeeeE .  .   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7]     .    .    .    .  DE.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE.   st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    . DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    . .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .    .  DE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE . .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    . DE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4152,7 +4152,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4162,18 +4162,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .   DE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .    .  DE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5]     .    .    . DE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeE.   st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .  DE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    . DE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .DE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4198,7 +4198,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2102
+# CHECK-NEXT: Total Cycles:      2101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4208,18 +4208,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT: Index     0123456789          01
 
-# CHECK:      [0,0]     DeeeE.    .    .    . .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .   DE    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .  DE.    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     .    .    . DE .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . DeeeE   . .   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .DE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .DeeeeE.   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .  DE.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DE .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .DE  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeeE   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4244,7 +4244,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4254,18 +4254,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4290,7 +4290,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4300,18 +4300,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4336,7 +4336,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4346,18 +4346,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4382,7 +4382,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4392,18 +4392,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4428,7 +4428,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4438,18 +4438,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4474,7 +4474,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      1502
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        900
 
 # CHECK:      Dispatch Width:    3
@@ -4483,15 +4483,15 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,1]     .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4512,7 +4512,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4522,18 +4522,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4558,7 +4558,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4568,18 +4568,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4604,7 +4604,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4614,18 +4614,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4650,7 +4650,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4660,18 +4660,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4696,7 +4696,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4706,18 +4706,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4742,7 +4742,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4752,18 +4752,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4788,7 +4788,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4798,18 +4798,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4834,7 +4834,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4844,18 +4844,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4880,7 +4880,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2502
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
@@ -4890,18 +4890,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .    DE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .    .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .   DE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4926,7 +4926,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      2002
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1200
 
 # CHECK:      Dispatch Width:    3
@@ -4936,16 +4936,16 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeE    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     .    DE   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DE   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    DE   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeeE.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    DE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     .   DE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions



More information about the llvm-commits mailing list