[llvm] 6c4b3dc - [AArch64] Fix postinc operands for Cortex-A53 scheduling

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 10 02:14:51 PDT 2023


Author: David Green
Date: 2023-10-10T10:14:44+01:00
New Revision: 6c4b3dc340d1afda02e68ce42ebafa77fa076797

URL: https://github.com/llvm/llvm-project/commit/6c4b3dc340d1afda02e68ce42ebafa77fa076797
DIFF: https://github.com/llvm/llvm-project/commit/6c4b3dc340d1afda02e68ce42ebafa77fa076797.diff

LOG: [AArch64] Fix postinc operands for Cortex-A53 scheduling

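Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc
loads/stores in the Cortex-A53 scheduling model. WriteAdr is now listed first in
the InstRW entries for the _POST forms of the vector LD1-LD4 and ST1-ST4
instructions, ahead of the A53WriteVLD*/A53WriteVST* write.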

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SchedA53.td
    llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index 67e2b07692509ac..3e4168f5f445f58 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -215,39 +215,39 @@ def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD1], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
 def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
 def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
-def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD1], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
 def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
 def : InstRW<[A53WriteVLD3], (instregex "LD3Threev2d$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
-def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev2d_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD2], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD3], (instregex "LD3Threev2d_POST$")>;
 
 def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
 def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
 def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
-def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD2], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVLD4], (instregex "LD4Fourv(2d)_POST$")>;
 
 //---
 // Vector Stores
@@ -257,32 +257,32 @@ def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
 def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST1], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
 def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
 def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST1], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
 def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
 def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST3Threev(2d)_POST$")>;
 
 def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
 def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
 def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
-def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
-def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A53WriteVST2], (instregex "ST4Fourv(2d)_POST$")>;
 
 //---
 // Floating Point MAC, DIV, SQRT

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s
index c5ca6f9f1764aa0..ed5b0869e53ded9 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A53-writeback.s
@@ -1162,28 +1162,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1208,28 +1208,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1254,28 +1254,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1300,28 +1300,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3901
+# CHECK-NEXT: Total Cycles:      2401
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.38
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    0.62
+# CHECK-NEXT: IPC:               0.42
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   .   ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .    .   .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    DeeeeE    .    .    .   .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .  DeeeeE .    .   .   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .DeeeeE  .   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeeE.    .    .   .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .DeeE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .   DeeeeE.    .   .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .DeeE.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeeE.   .   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .   DeeeeE   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1346,28 +1346,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1392,28 +1392,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1438,28 +1438,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4301
+# CHECK-NEXT: Total Cycles:      2801
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.35
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.54
+# CHECK-NEXT: IPC:               0.36
 # CHECK-NEXT: Block RThroughput: 13.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeE  .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    .    . DeeE    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    DeeeeeE   .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeeE  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeE   .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .  DeeE   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .DeeeeeE  .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeE   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1484,28 +1484,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4501
+# CHECK-NEXT: Total Cycles:      3001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 15.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    .    .   DeeE  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    . DeeeeeE .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeeE  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1530,28 +1530,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4501
+# CHECK-NEXT: Total Cycles:      3001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 15.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .   DeeE  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    . DeeeeeE .    .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .  DeeE   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .DeeeeeE  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1576,28 +1576,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4701
+# CHECK-NEXT: Total Cycles:      3201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.32
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 17.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234567
-
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .   DeeeeeE    .    .    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeE.    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    . DeeeeeeE.    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    . DeeeeeeE. .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeeE    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    . DeeE    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    DeeeeeeE   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1622,28 +1622,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total Cycles:      3501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.30
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    .    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeeE .    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeeE   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1668,28 +1668,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total Cycles:      3501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.30
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeeE .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeeE   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1714,28 +1714,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4701
+# CHECK-NEXT: Total Cycles:      3201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.32
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    0.47
+# CHECK-NEXT: IPC:               0.31
 # CHECK-NEXT: Block RThroughput: 17.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234567
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeE. .   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    . .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeeE . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1760,28 +1760,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1806,28 +1806,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1852,28 +1852,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1898,28 +1898,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1944,28 +1944,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1990,28 +1990,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .    .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeE.    .    .    .    .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .    .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeeeeE    .    .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2036,28 +2036,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4801
+# CHECK-NEXT: Total Cycles:      3301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    0.45
+# CHECK-NEXT: IPC:               0.30
 # CHECK-NEXT: Block RThroughput: 18.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeE    .    .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeE    .    .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .    DeeE .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .  DeeeeeeE    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeeeE  .   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeeE.    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeeeE   .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeeeE   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2082,28 +2082,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4401
+# CHECK-NEXT: Total Cycles:      2901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.34
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeE .    .    .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .DeeeeE   .    .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .   DeeeeeeE  .   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeeeE  .    .   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeE  .   .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeeeE   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2128,28 +2128,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3801
+# CHECK-NEXT: Total Cycles:      2301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.39
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeE.    .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .    .    .   DeeE  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    . DeeeE   .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeE .    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .DeeeE  .   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeE   .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    .  DeeE   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .DeeeE    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.  .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .DeeE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .   DeeeE   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2174,28 +2174,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2220,28 +2220,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2266,28 +2266,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2312,28 +2312,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3501
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE  .   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2358,28 +2358,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3701
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.41
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .    .    . .   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeE   .    .    .    .    . .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeE .    .    .    . .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeE    .    . .   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE. .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    . .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   . .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeeeE   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2404,28 +2404,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5001
+# CHECK-NEXT: Total Cycles:      3501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.30
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.43
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 20.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeeeE  .    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeE  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeeE .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeeE   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2450,28 +2450,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4901
+# CHECK-NEXT: Total Cycles:      3401
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.31
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.44
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 19.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeE   .    .    .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .DeeE.    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .   DeeeeeeE   .    .    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeE   .    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .DeeE.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .   DeeeeeeE  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          01234
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    DeeE .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .  DeeeeeeE    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeeeE  .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .   DeeE  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    . DeeeeeeE   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2496,28 +2496,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4601
+# CHECK-NEXT: Total Cycles:      3101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    0.48
+# CHECK-NEXT: IPC:               0.32
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    .    .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    . DeeE    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    DeeeeeeE  .    .    .    .    .    ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    . DeeE    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    DeeeeeeE  .    .    .    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    . DeeE    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    DeeeeE    .    ..   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    DeeE .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .  DeeeeE ..   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeE  .    .    .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    . DeeeeeeE.    .    .    ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .   DeeeeeeE   .    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeE   ..   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeE   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2542,28 +2542,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2588,28 +2588,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
-# CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2634,28 +2634,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2680,28 +2680,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2726,28 +2726,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2772,28 +2772,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5101
+# CHECK-NEXT: Total Cycles:      3601
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    0.42
+# CHECK-NEXT: IPC:               0.28
 # CHECK-NEXT: Block RThroughput: 21.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeeeE    .    .    .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .    DeeE .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .  DeeeeeeeE   .    .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .DeeE.    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .   DeeeeeeeE  .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    . DeeE    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    DeeeeeeeE ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeeeE  .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeeeE    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeeeeE .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeeeE   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2818,28 +2818,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5401
+# CHECK-NEXT: Total Cycles:      3901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.28
-# CHECK-NEXT: IPC:               0.19
+# CHECK-NEXT: uOps Per Cycle:    0.38
+# CHECK-NEXT: IPC:               0.26
 # CHECK-NEXT: Block RThroughput: 24.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeE .    .    .    .    .    .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .    .  DeeE   .    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    .DeeeeeeeE.    .    .    .    .    .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .    .   DeeE  .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    . DeeeeeeeE    .    .    .    .   .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeeE    .    .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    DeeE .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .  DeeeeeeeE  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeE .    .    .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeeeeE   .    .    .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeeE.    .    .   .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeeeE   .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .  DeeE   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .DeeeeeeeE   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2864,28 +2864,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5501
+# CHECK-NEXT: Total Cycles:      4001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.27
-# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: uOps Per Cycle:    0.37
+# CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 25.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeE .    .    .    .    .    .    .    .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .  DeeE   .    .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .    .DeeeeeeeE.    .    .    .    .    .    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .    .   DeeE  .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .    . DeeeeeeeE    .    .    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .    .    DeeE .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .    .    .  DeeeeeeeE   .    .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .    .    .    .DeeE.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    .    .    .   DeeeeeeeE  .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeE .    .    .    .    .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .  DeeeeeeeE   .    .    .    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .    .DeeeeeeeE.    .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeeeeE  .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeeeeE   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2910,28 +2910,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2956,28 +2956,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3002,28 +3002,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3048,28 +3048,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3094,28 +3094,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4001
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.37
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    .    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeeE.    .    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .   DeeE  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .    . DeeeeE  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3140,28 +3140,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3201
+# CHECK-NEXT: Total Cycles:      2301
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.53
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    0.74
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .  DeeeeE .    .    .    . .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .    .DeeeeE   .    . .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    .DeeE.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .   DeeeE . .   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeE   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeE    .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .  .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeE.  .   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .    .    .DeeE.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .   DeeeE   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3508,28 +3508,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2201
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    . .   ldrsh	x1, [x27, #254]!
-# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   ldrsw	x1, [x27], #254
-# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  DeeeE  .    . .   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    DeeeE.    . .   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    DeeeE. .   st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3554,28 +3554,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3600,28 +3600,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3646,28 +3646,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3692,28 +3692,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3738,28 +3738,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3784,28 +3784,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3830,28 +3830,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3876,28 +3876,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3922,28 +3922,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3968,28 +3968,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4014,28 +4014,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4060,28 +4060,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4106,28 +4106,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.55
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    .   .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE  .   st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    . .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE . .   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .    .    .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4152,28 +4152,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4198,28 +4198,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2603
+# CHECK-NEXT: Total Cycles:      2101
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.58
-# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: uOps Per Cycle:    0.71
+# CHECK-NEXT: IPC:               0.48
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeE.    .    .    .    .  .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .  .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    .  .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    .  .   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE  .   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeE.    .    .    ..   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    ..   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    ..   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   ..   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeeE   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4244,28 +4244,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.55
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   .   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    .   .   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   .   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .DeeeE    .   .   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .    DeeE .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .DeeeeE  .   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    . .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    . .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .    DeeE .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeeE .    . .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    DeeE .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .  DeeeE  . .   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    . DeeeeE   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4290,28 +4290,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2803
+# CHECK-NEXT: Total Cycles:      2301
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.54
-# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: uOps Per Cycle:    0.65
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeE   .    .    .   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .    .   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeE  .   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    . DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeE    .    .    .  .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeE.    .  .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DeeE.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE .  .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    DeeE .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeeE   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4336,28 +4336,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2703
+# CHECK-NEXT: Total Cycles:      2201
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.55
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    0.68
+# CHECK-NEXT: IPC:               0.45
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          012
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    .   .   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   .   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    DeeE .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .DeeeeE   .    .   .   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .DeeE.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    . DeeeE   .   .   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeE  .   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    . .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeeE.    .    . .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .DeeE.    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .   DeeeeE.    . .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .DeeE.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .   DeeeE . .   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .    .    DeeE . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .  DeeeE   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4382,28 +4382,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4428,28 +4428,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2503
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    0.75
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeE.    .    .    .    . .   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .   DeeE  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    DeeeE.    .    .    . .   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    DeeeE.    .    . .   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    DeeeE.    . .   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    DeeeE. .   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeE.    .    .    .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .DeeE.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   DeeeE .    .    .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  DeeeE  .    .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeeE  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . DeeeE   .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    .  DeeE   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .DeeeE   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4474,24 +4474,24 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2003
+# CHECK-NEXT: Total Cycles:      1701
 # CHECK-NEXT: Total uOps:        900
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.45
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    0.53
+# CHECK-NEXT: IPC:               0.35
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    .    . .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,1]     .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    . DeeE    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .  DeeeeeE. .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    . .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeeE    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeeE   . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .  DeeE   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .DeeeeeE   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4512,28 +4512,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3403
+# CHECK-NEXT: Total Cycles:      2901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.44
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    ..   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    ..   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    ..   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeE ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .   .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.   .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4558,28 +4558,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total Cycles:      3001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 15.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    . .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    . .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE. .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4604,28 +4604,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3103
+# CHECK-NEXT: Total Cycles:      2601
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.48
-# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: uOps Per Cycle:    0.58
+# CHECK-NEXT: IPC:               0.38
 # CHECK-NEXT: Block RThroughput: 11.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .  .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeE  .    .    .    .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .  DeeeeE .    .    .  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     .    .    .    .  DeeE   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .   DeeeeE.    .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .   DeeE  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    DeeeeE  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    .  DeeE   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .DeeeeE   .    ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    .    .  DeeE   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .DeeeeE   ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .  DeeE   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .DeeeeE   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4650,28 +4650,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4696,28 +4696,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3103
+# CHECK-NEXT: Total Cycles:      2601
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.48
-# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: uOps Per Cycle:    0.58
+# CHECK-NEXT: IPC:               0.38
 # CHECK-NEXT: Block RThroughput: 11.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .  .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    .  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    .  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    .  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE  .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeeE   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .    .    .  DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4742,28 +4742,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3503
+# CHECK-NEXT: Total Cycles:      3001
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.43
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.33
 # CHECK-NEXT: Block RThroughput: 15.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .    . .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .    . .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeeE    .    .    . .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    .    DeeE .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    .DeeeeeE  .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    .    .    . DeeE    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .  DeeeeeE. .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeeE .    .    .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    .    DeeE .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeeE.    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    .    .DeeE.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeeE   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4788,28 +4788,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3403
+# CHECK-NEXT: Total Cycles:      2901
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.44
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.52
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    .    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    . DeeE    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .  DeeeeeE.    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeeE   .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .    .DeeE.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    . DeeeeeE ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeeE   .    .    .   .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .  DeeE   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    .DeeeeeE  .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeeE .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    .    DeeE .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    .  DeeeeeE   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .    .DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4834,28 +4834,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3203
+# CHECK-NEXT: Total Cycles:      2701
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.47
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    0.56
+# CHECK-NEXT: IPC:               0.37
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
 
-# CHECK:      [0,0]     DeeeeeE   .    .    .    .    .   .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .    .DeeE.    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    . DeeeeeE .    .    .    .   .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .  DeeE   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    .   DeeeeE.    .    .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .    .   DeeE  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .    DeeeeE    .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .    .    .    DeeE .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .    .DeeeeE  .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .    .DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeE   .    .    .    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeeE   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    .DeeeeeE  .    .    . .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeeE  .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .   DeeE  .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    . DeeeeE  . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .    .   DeeE  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    . DeeeeE   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    .   DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4880,28 +4880,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3003
+# CHECK-NEXT: Total Cycles:      2501
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeE    .    .    .    .    . .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE .    . .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .    .   DeeeeE. .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .    .    .   DeeE   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeE    .    .    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .    .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .    .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE    .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .    DeeeeE   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4926,26 +4926,26 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      2403
+# CHECK-NEXT: Total Cycles:      2001
 # CHECK-NEXT: Total uOps:        1200
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    0.60
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeE    .    .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     .    DeeE .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    .DeeeeE   .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    .    .DeeE.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    . DeeeeE  .    ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     .    .    .    . DeeE    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .    .  DeeeeE ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .    .    .  DeeE   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeE    .    .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     . DeeE    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    DeeeeE    .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    . DeeE    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    DeeeeE    .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    .    . DeeE    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .    DeeeeE   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .    . DeeE   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions


        


More information about the llvm-commits mailing list