[llvm] 475d687 - [AArch64] Fix postinc operands for Cortex-A55 scheduling

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 10 04:56:40 PDT 2023


Author: David Green
Date: 2023-10-10T12:56:33+01:00
New Revision: 475d687ac1123f7ea01bc90789d93b5938930ca1

URL: https://github.com/llvm/llvm-project/commit/475d687ac1123f7ea01bc90789d93b5938930ca1
DIFF: https://github.com/llvm/llvm-project/commit/475d687ac1123f7ea01bc90789d93b5938930ca1.diff

LOG: [AArch64] Fix postinc operands for Cortex-A55 scheduling

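Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc
loads/stores in the Cortex-A55 scheduling model. In each affected InstRW list,
WriteAdr is now listed before the CortexA55WriteVLD*/CortexA55WriteVST* entry
instead of after it. In the updated llvm-mca test, an add that reads the
post-incremented base register is now dispatched sooner after the load.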

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SchedA55.td
    llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 533fb9330260606..cb77be350d12444 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -285,16 +285,16 @@ def : InstRW<[CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
 def : InstRW<[CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD6], (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD8], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
 
 //    2-element structures
 def : InstRW<[CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)$")>;
@@ -302,10 +302,10 @@ def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$"
 def : InstRW<[CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
 def : InstRW<[CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
-def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
 
 //    3-element structures
 def : InstRW<[CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
@@ -313,10 +313,10 @@ def : InstRW<[CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$"
 def : InstRW<[CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)$")>;
 def : InstRW<[CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD3, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD6, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD3], (instregex "LD3Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD6], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
 
 //    4-element structures
 def : InstRW<[CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;                // load single 4-el structure to one lane of 4 regs.
@@ -324,10 +324,10 @@ def : InstRW<[CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$"
 def : InstRW<[CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)$")>;           // load multiple 4-el structures to 4 regs.
 def : InstRW<[CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
 
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD4, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
-def : InstRW<[CortexA55WriteVLD8, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD4], (instregex "LD4Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD8], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 
 //---
 // Vector Stores
@@ -337,28 +337,28 @@ def : InstRW<[CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)
 def : InstRW<[CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)$")>;
 def : InstRW<[CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)$")>;
 def : InstRW<[CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
-def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
 def : InstRW<[CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST3Threev(8b|4h|2s|1d|2d|16b|8h|4s|4d)_POST$")>;
 
 def : InstRW<[CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
 def : InstRW<[CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[CortexA55WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
-def : InstRW<[CortexA55WriteVST4, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, CortexA55WriteVST4], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 //---
 // Floating Point Conversions, MAC, DIV, SQRT

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s
index 76f46ccf0c5cb4a..0fded5f8f08f49f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-writeback.s
@@ -1162,28 +1162,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3701
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.41
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    . .   ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .    . .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    DeeeE.    .    .    . .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    . DeeeE   .    . .   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .   DeeeeE. .   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    . .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeeE.    .    . .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .DeeE.    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .   DeeeE .    . .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    DeeE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .  DeeeE  . .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    . DeeeeE   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1208,28 +1208,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3801
+# CHECK-NEXT: Total Cycles:      2301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.39
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .    .  .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    DeeeeE    .    .    .  .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .  DeeeE  .    .  .   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    DeeeeE  .   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeE.    .    .    .  .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeeE.    .    .  .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .DeeE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .   DeeeeE.    .  .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .    .DeeE.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE .  .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DeeE .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeeE   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1254,28 +1254,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3701
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.41
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    . .   ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    . .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeE.    .    .    . .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    . DeeeE   .    . .   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .   DeeeeE. .   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    . .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeeE .    . .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    DeeE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .  DeeeE  . .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    . DeeeeE   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1300,28 +1300,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4201
+# CHECK-NEXT: Total Cycles:      2701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    0.56
+# CHECK-NEXT: IPC:               0.37
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    . .   ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    . .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    . .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    . .   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeE. .   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    . .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    . .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeeE  .    . .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeE  . .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeE   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1346,28 +1346,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4601
+# CHECK-NEXT: Total Cycles:      3101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.32
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    ..   ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .    ..   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    ..   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .    DeeE .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .  DeeeeeeE    .    ..   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeE ..   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    ..   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    ..   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeeE.    .    ..   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeeE   ..   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeE   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1392,28 +1392,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total Cycles:      2901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.34
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .   .   ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .   .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeE .    .    .    .   .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeeeE .    .   .   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeE  .   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .   .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .   .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .   .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeeeE.   .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1438,28 +1438,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total Cycles:      3501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.30
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeE   .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeeeE .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .  DeeE   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .DeeeeeE  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    .    .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeeeeeE.    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .   DeeeeeE   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1484,28 +1484,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5401
+# CHECK-NEXT: Total Cycles:      3901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.28
-# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.26
 # CHECK-NEXT: Block RThroughput: 24.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    .   .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeE  .    .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeeeE.    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    . DeeeeeeeeE  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .   .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeeeeE    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeeeE   .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeeeeE    .   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    DeeeeeeeeE   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1530,28 +1530,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5101
+# CHECK-NEXT: Total Cycles:      3601
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.28
 # CHECK-NEXT: Block RThroughput: 21.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    ..   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .    ..   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeE  .    .    .    .    ..   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeE   .    .    ..   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .DeeE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .   DeeeeeeeeE ..   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    ..   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeeeeE    .    .    .    ..   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeeeE   .    .    ..   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    ..   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .   DeeE  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    . DeeeeeeeeE   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1576,28 +1576,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5701
+# CHECK-NEXT: Total Cycles:      4201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.26
-# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.24
 # CHECK-NEXT: Block RThroughput: 27.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeeeeE    .    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeeE    .    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .  DeeeeeeeeeeE. .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeeeeE    .    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeeeeeeE.    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeeeE   .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .  DeeE   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .DeeeeeeeeeeE   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1622,28 +1622,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5801
+# CHECK-NEXT: Total Cycles:      4301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.26
-# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 28.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeeeeeE   .    .    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .DeeE.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .   DeeeeeeE   .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .   DeeeeeeeeeeE  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeeeeeE    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    . DeeE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    DeeeeeeE  .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeeeeeeeE   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1668,28 +1668,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5801
+# CHECK-NEXT: Total Cycles:      4301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.26
-# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 28.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeeE   .    .    .    .    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     .    .    .DeeE.    .    .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    .   DeeeeeeE   .    .    .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeeeeeE    .    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .    DeeE .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .  DeeeeeeE    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    DeeE .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .  DeeeeeeE  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeeeE   .    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .    .  DeeE   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    .DeeeeeeE .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeeeeE.    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeE   .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeeeE   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1714,28 +1714,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5901
+# CHECK-NEXT: Total Cycles:      4401
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: uOps Per Cycle:    0.34
+# CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 29.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeeE   .    .    .    .    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .    .DeeE.    .    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    .   DeeeeeeE   .    .    .    .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeeeeeE    .    .    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .  DeeeeeeeeeeE.    .   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    .   DeeE  .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .    . DeeeE  .   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeeeeeeeeE   .    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .    .  DeeE   .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .    .DeeeeeeE .    .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeeeeE.    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeeeeeE   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeE   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1760,28 +1760,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1806,28 +1806,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1852,28 +1852,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1898,28 +1898,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1944,28 +1944,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1990,28 +1990,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .    .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .    .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeE    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2036,28 +2036,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4801
+# CHECK-NEXT: Total Cycles:      3301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    0.45
+# CHECK-NEXT: IPC:               0.30
 # CHECK-NEXT: Block RThroughput: 18.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .    DeeE .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .  DeeeeeeE    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeeeE  .   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeeE.    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeeE   .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeeeE   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2082,28 +2082,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total Cycles:      2901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.34
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeeeE  .   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeeE  .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeE  .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeeE   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2128,28 +2128,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4201
+# CHECK-NEXT: Total Cycles:      2701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    0.56
+# CHECK-NEXT: IPC:               0.37
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
 
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeE .    .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeE. .   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeE  . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeE   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2174,28 +2174,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2220,28 +2220,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2266,28 +2266,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2312,28 +2312,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2358,28 +2358,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total Cycles:      2901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.34
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   .   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeeeeeE  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeeeeeE   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2404,28 +2404,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5101
+# CHECK-NEXT: Total Cycles:      3601
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.28
 # CHECK-NEXT: Block RThroughput: 21.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    .    ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeeE  .    .    .    .    ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeE   .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .DeeE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .   DeeeeeeeeE ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .    ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeeeeE   .    .    ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .   DeeE  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    . DeeeeeeeeE   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2450,28 +2450,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5401
+# CHECK-NEXT: Total Cycles:      3901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.28
-# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.26
 # CHECK-NEXT: Block RThroughput: 24.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeE.    .    .    .    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     .    .   DeeE  .    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    . DeeeeeeeeE   .    .    .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeE    .    .    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeE.    .    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    . DeeeeeeeeE  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeE.    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeeE    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    DeeeeeeeeE   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2496,28 +2496,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4901
+# CHECK-NEXT: Total Cycles:      3401
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.44
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 19.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .   .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeeeeE .    .    .    .    .    .   .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .DeeeeeeeeE    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeE .    .   .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .DeeeeE  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeeeeE    .    .    .   .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    DeeeeeeeeE.    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.   .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .   DeeeeE   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2542,28 +2542,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2588,28 +2588,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2634,28 +2634,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2680,28 +2680,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2726,28 +2726,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2772,28 +2772,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5601
+# CHECK-NEXT: Total Cycles:      4101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.24
 # CHECK-NEXT: Block RThroughput: 26.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeeeeeeeE.    .    .    .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .    .   DeeE  .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    . DeeeeeeE.    .    .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .   DeeE  .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    . DeeeeeeE.    .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .   DeeE  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    . DeeeeeeeeeeE ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeeeeeeeE   .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .  DeeeeeeE    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    DeeeeeeeeeeE   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2818,28 +2818,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      6201
+# CHECK-NEXT: Total Cycles:      4701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.24
-# CHECK-NEXT: IPC:               0.16
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.21
 # CHECK-NEXT: Block RThroughput: 32.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          012
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeeeeeE   .    .    .    .    .    .    .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeeeeeE    .    .    .    .    . .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .    .  DeeeeeeeeeeE.    .    . .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .    . DeeeeeeE. .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeeeeeE .    .    .    .    .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeeeeE.    .    .    . .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeeeeeE    . .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2864,28 +2864,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      6201
+# CHECK-NEXT: Total Cycles:      4701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.24
-# CHECK-NEXT: IPC:               0.16
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.21
 # CHECK-NEXT: Block RThroughput: 32.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789          012
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .    .    . .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeeeeeE   .    .    .    .    .    .    .    . .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .    .DeeE.    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    .   DeeeeeeE   .    .    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    .DeeE.    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .   DeeeeeeeeeeE    .    .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .    .  DeeeeeeeeeeE. .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    . .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeeeeeE .    .    .    .    .    . .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    . .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    . DeeE    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .    DeeeeeeeeeeE   .    . .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeeeeeeeE   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2910,28 +2910,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2956,28 +2956,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3002,28 +3002,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3048,28 +3048,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3094,28 +3094,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3140,28 +3140,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3301
+# CHECK-NEXT: Total Cycles:      2401
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.51
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.42
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeE .  .   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeE   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeeE    .    .    .   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .   .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.   .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .   DeeeeE   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3508,28 +3508,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeE .    .    .    . .   ldrsh	x1, [x27, #254]!
-# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeE  .    .    . .   ldrsw	x1, [x27], #254
-# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeE   .    . .   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    DeeeE.    . .   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    DeeeE. .   st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeE .    .    .    .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeE  .    .    .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeE   .    .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3554,28 +3554,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3600,28 +3600,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3646,28 +3646,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3692,30 +3692,30 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      Average Wait times (based on the timeline view):
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
 # CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
 # CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
@@ -3738,28 +3738,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3784,28 +3784,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3830,28 +3830,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3876,28 +3876,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3922,28 +3922,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3968,28 +3968,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 18.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4014,28 +4014,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4060,28 +4060,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4106,28 +4106,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.55
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 11.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    .   .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE  .   st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    . .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE . .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4152,28 +4152,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4198,28 +4198,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2603
+# CHECK-NEXT: Total Cycles:      2101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.58
-# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.48
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .  .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .  .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .  .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .  .   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE  .   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeeE   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4244,28 +4244,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4290,28 +4290,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4336,28 +4336,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4382,28 +4382,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4428,28 +4428,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4474,24 +4474,24 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total Cycles:      1501
 # CHECK-NEXT: Total uOps:        900
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    .    .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4512,28 +4512,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4558,28 +4558,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4604,28 +4604,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4650,28 +4650,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4696,28 +4696,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4742,28 +4742,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4788,28 +4788,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4834,28 +4834,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4880,28 +4880,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4926,26 +4926,26 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1200
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions


        


More information about the llvm-commits mailing list