[llvm] f635ab3 - [AArch64] Fix postinc operands for Neoverse-N1 scheduling

David Green via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 11 04:04:03 PDT 2023


Author: David Green
Date: 2023-10-11T12:03:56+01:00
New Revision: f635ab3ffd85b2c7e6b8da106ec482faba99214e

URL: https://github.com/llvm/llvm-project/commit/f635ab3ffd85b2c7e6b8da106ec482faba99214e
DIFF: https://github.com/llvm/llvm-project/commit/f635ab3ffd85b2c7e6b8da106ec482faba99214e.diff

LOG: [AArch64] Fix postinc operands for Neoverse-N1 scheduling

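Similar to D159254, this fixes the order of the WriteAdr operands on post/pre-inc
loads/stores in the Neoverse-N1 scheduling model. WriteAdr is now listed first in
each affected InstRW, ahead of the load/store write, so that an instruction which
only depends on the written-back base register no longer waits for the full
load/store latency (see the updated N1-writeback.s llvm-mca timelines).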

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
    llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 220bebc668ed508..2ec9600f84f7e58 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -377,7 +377,7 @@ def : InstRW<[N1Write_4c_1L, N1Write_0c_0Z], (instrs LDPWi, LDNPWi)>;
 def : InstRW<[N1Write_5c_1I_1L, N1Write_0c_0Z], (instrs LDPSWi)>;
 
 // Load pair, immed post or pre-index, signed words
-def : InstRW<[N1Write_5c_1I_1L, N1Write_0c_0Z, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_1I_1L, N1Write_0c_0Z],
              (instrs LDPSWpost, LDPSWpre)>;
 
 
@@ -477,7 +477,7 @@ def : InstRW<[N1Write_5c_1L, ReadAdrBase], (instregex "^LDR[SDQ]l$",
 
 // Load vector reg, immed post-index
 // Load vector reg, immed pre-index
-def : InstRW<[N1Write_5c_1L, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_1L],
              (instregex "^LDR[BHSDQ](post|pre)$")>;
 
 // Load vector reg, unsigned immed
@@ -501,12 +501,12 @@ def : InstRW<[N1Write_7c_1I_1L, WriteLDHi], (instregex "^LDPN?[HQ]i$")>;
 
 // Load vector pair, immed post-index, S/D-form
 // Load vector pair, immed pre-index, S/D-form
-def : InstRW<[N1Write_5c_1L, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_1L, WriteLDHi],
              (instregex "^LDP[SD](pre|post)$")>;
 
 // Load vector pair, immed post-index, Q-form
 // Load vector pair, immed pre-index, Q-form
-def : InstRW<[N1Write_7c_1L, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_7c_1L, WriteLDHi],
              (instrs LDPQpost, LDPQpre)>;
 
 
@@ -521,11 +521,11 @@ def : InstRW<[N1Write_2c_2I_2L], (instrs STURQi)>;
 
 // Store vector reg, immed post-index, B/H/S/D-form
 // Store vector reg, immed pre-index, B/H/S/D-form
-def : InstRW<[N1Write_2c_1L_1V, WriteAdr], (instregex "^STR[BHSD](pre|post)$")>;
+def : InstRW<[WriteAdr, N1Write_2c_1L_1V], (instregex "^STR[BHSD](pre|post)$")>;
 
 // Store vector reg, immed pre-index, Q-form
 // Store vector reg, immed post-index, Q-form
-def : InstRW<[N1Write_2c_2L_2V, WriteAdr], (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, N1Write_2c_2L_2V], (instrs STRQpre, STRQpost)>;
 
 // Store vector reg, unsigned immed, B/H/S/D-form
 def : InstRW<[N1Write_2c_1L_1V], (instregex "^STR[BHSD]ui$")>;
@@ -562,15 +562,15 @@ def : InstRW<[N1Write_3c_4L_2V], (instrs STPQi, STNPQi)>;
 
 // Store vector pair, immed post-index, S-form
 // Store vector pair, immed pre-index, S-form
-def : InstRW<[N1Write_2c_1L_1V, WriteAdr], (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, N1Write_2c_1L_1V], (instrs STPSpre, STPSpost)>;
 
 // Store vector pair, immed post-index, D-form
 // Store vector pair, immed pre-index, D-form
-def : InstRW<[N1Write_2c_2L_2V, WriteAdr], (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, N1Write_2c_2L_2V], (instrs STPDpre, STPDpost)>;
 
 // Store vector pair, immed post-index, Q-form
 // Store vector pair, immed pre-index, Q-form
-def : InstRW<[N1Write_3c_4L_2V, WriteAdr], (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, N1Write_3c_4L_2V], (instrs STPQpre, STPQpost)>;
 
 
 // ASIMD integer instructions
@@ -818,25 +818,25 @@ def : InstRW<[N1Write_5c_1M_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
 // ASIMD load, 1 element, multiple, 1 reg
 def : InstRW<[N1Write_5c_1L],
              (instregex "^LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_5c_1L, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_1L],
              (instregex "^LD1Onev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 2 reg
 def : InstRW<[N1Write_5c_2L],
              (instregex "^LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_5c_2L, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_2L],
              (instregex "^LD1Twov(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 3 reg
 def : InstRW<[N1Write_6c_3L],
              (instregex "^LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_6c_3L, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_6c_3L],
              (instregex "^LD1Threev(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 4 reg
 def : InstRW<[N1Write_6c_4L],
              (instregex "^LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_6c_4L, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_6c_4L],
              (instregex "^LD1Fourv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
 // ASIMD load, 1 element, one lane
@@ -844,7 +844,7 @@ def : InstRW<[N1Write_6c_4L, WriteAdr],
 def : InstRW<[N1Write_7c_1L_1V],
              (instregex "LD1(i|Rv)(8|16|32|64)$",
                         "LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_7c_1L_1V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_7c_1L_1V],
              (instregex "LD1i(8|16|32|64)_POST$",
                         "LD1Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
@@ -855,7 +855,7 @@ def : InstRW<[N1Write_7c_2L_2V],
              (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|2d)$",
                         "LD2i(8|16|32|64)$",
                         "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_7c_2L_2V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_7c_2L_2V],
              (instregex "LD2Twov(8b|16b|4h|8h|2s|4s|2d)_POST$",
                         "LD2i(8|16|32|64)_POST$",
                         "LD2Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
@@ -863,7 +863,7 @@ def : InstRW<[N1Write_7c_2L_2V, WriteAdr],
 // ASIMD load, 3 element, multiple
 def : InstRW<[N1Write_8c_3L_3V],
              (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|2d)$")>;
-def : InstRW<[N1Write_8c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_8c_3L_3V],
              (instregex "LD3Threev(8b|16b|4h|8h|2s|4s|2d)_POST$")>;
 
 // ASIMD load, 3 element, one lane
@@ -871,20 +871,20 @@ def : InstRW<[N1Write_8c_3L_3V, WriteAdr],
 def : InstRW<[N1Write_7c_2L_3V],
              (instregex "LD3i(8|16|32|64)$",
                         "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_7c_2L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_7c_2L_3V],
              (instregex "LD3i(8|16|32|64)_POST$",
                         "LD3Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
 // ASIMD load, 4 element, multiple, D-form
 def : InstRW<[N1Write_8c_3L_4V],
              (instregex "LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[N1Write_8c_3L_4V, WriteAdr], 
+def : InstRW<[WriteAdr, N1Write_8c_3L_4V],
              (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
 
 // ASIMD load, 4 element, multiple, Q-form
 def : InstRW<[N1Write_10c_4L_4V],
              (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_10c_4L_4V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_10c_4L_4V],
              (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 4 element, one lane
@@ -892,7 +892,7 @@ def : InstRW<[N1Write_10c_4L_4V, WriteAdr],
 def : InstRW<[N1Write_8c_4L_4V],
              (instregex "LD4i(8|16|32|64)$",
                         "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)$")>;
-def : InstRW<[N1Write_8c_4L_4V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_8c_4L_4V],
              (instregex "LD4i(8|16|32|64)_POST$",
                         "LD4Rv(8b|16b|4h|8h|2s|4s|1d|2d)_POST$")>;
 
@@ -903,127 +903,127 @@ def : InstRW<[N1Write_8c_4L_4V, WriteAdr],
 // ASIMD store, 1 element, multiple, 1 reg, D-form
 def : InstRW<[N1Write_2c_1L_1V],
              (instregex "ST1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[N1Write_2c_1L_1V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_2c_1L_1V],
              (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 1 reg, Q-form
 def : InstRW<[N1Write_2c_1L_1V],
              (instregex "ST1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_2c_1L_1V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_2c_1L_1V],
              (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 2 reg, D-form
 def : InstRW<[N1Write_2c_1L_2V],
              (instregex "ST1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[N1Write_2c_1L_2V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_2c_1L_2V],
              (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 2 reg, Q-form
 def : InstRW<[N1Write_3c_2L_2V],
              (instregex "ST1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_3c_2L_2V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_3c_2L_2V],
              (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 3 reg, D-form
 def : InstRW<[N1Write_3c_2L_3V],           
              (instregex "ST1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[N1Write_3c_2L_3V, WriteAdr], 
+def : InstRW<[WriteAdr, N1Write_3c_2L_3V],
              (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 3 reg, Q-form
 def : InstRW<[N1Write_4c_3L_3V],
              (instregex "ST1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_4c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_4c_3L_3V],
              (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 4 reg, D-form
 def : InstRW<[N1Write_3c_2L_2V],
              (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[N1Write_3c_2L_2V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_3c_2L_2V],
              (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 4 reg, Q-form
 def : InstRW<[N1Write_5c_4L_4V],
              (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_5c_4L_4V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_4L_4V],
              (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, one lane
 def : InstRW<[N1Write_4c_1L_1V],
              (instregex "ST1i(8|16|32|64)$")>;
-def : InstRW<[N1Write_4c_1L_1V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_4c_1L_1V],
              (instregex "ST1i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 2 element, multiple, D-form, B/H/S
 def : InstRW<[N1Write_4c_1L_1V],
              (instregex "ST2Twov(8b|4h|2s)$")>;
-def : InstRW<[N1Write_4c_1L_1V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_4c_1L_1V],
              (instregex "ST2Twov(8b|4h|2s)_POST$")>;
 
 // ASIMD store, 2 element, multiple, Q-form
 def : InstRW<[N1Write_5c_2L_2V],
              (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_5c_2L_2V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_2L_2V],
              (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 2 element, one lane
 def : InstRW<[N1Write_4c_1L_1V],
              (instregex "ST2i(8|16|32|64)$")>;
-def : InstRW<[N1Write_4c_1L_1V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_4c_1L_1V],
              (instregex "ST2i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 3 element, multiple, D-form, B/H/S
 def : InstRW<[N1Write_5c_2L_2V],
              (instregex "ST3Threev(8b|4h|2s)$")>;
-def : InstRW<[N1Write_5c_2L_2V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_2L_2V],
              (instregex "ST3Threev(8b|4h|2s)_POST$")>;
 
 // ASIMD store, 3 element, multiple, Q-form
 def : InstRW<[N1Write_6c_3L_3V],
              (instregex "ST3Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N1Write_6c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_6c_3L_3V],
              (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 3 element, one lane, B/H/S
 def : InstRW<[N1Write_4c_3L_3V],
              (instregex "ST3i(8|16|32)$")>;
-def : InstRW<[N1Write_4c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_4c_3L_3V],
              (instregex "ST3i(8|16|32)_POST$")>;
 
 // ASIMD store, 3 element, one lane, D
 def : InstRW<[N1Write_5c_3L_3V],
              (instrs ST3i64)>;
-def : InstRW<[N1Write_5c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_3L_3V],
              (instrs ST3i64_POST)>;
 
 // ASIMD store, 4 element, multiple, D-form, B/H/S
 def : InstRW<[N1Write_7c_3L_3V],
              (instregex "ST4Fourv(8b|4h|2s)$")>;
-def : InstRW<[N1Write_7c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_7c_3L_3V],
              (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
 
 // ASIMD store, 4 element, multiple, Q-form, B/H/S
 def : InstRW<[N1Write_9c_6L_6V],
              (instregex "ST4Fourv(16b|8h|4s)$")>;
-def : InstRW<[N1Write_9c_6L_6V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_9c_6L_6V],
              (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
 
 // ASIMD store, 4 element, multiple, Q-form, D
 def : InstRW<[N1Write_6c_4L_4V],
              (instrs ST4Fourv2d)>;
-def : InstRW<[N1Write_6c_4L_4V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_6c_4L_4V],
              (instrs ST4Fourv2d_POST)>;
 
 // ASIMD store, 4 element, one lane, B/H/S
 def : InstRW<[N1Write_5c_3L_3V],
              (instregex "ST4i(8|16|32)$")>;
-def : InstRW<[N1Write_5c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_5c_3L_3V],
              (instregex "ST4i(8|16|32)_POST$")>;
 
 // ASIMD store, 4 element, one lane, D
 def : InstRW<[N1Write_4c_3L_3V],
              (instrs ST4i64)>;
-def : InstRW<[N1Write_4c_3L_3V, WriteAdr],
+def : InstRW<[WriteAdr, N1Write_4c_3L_3V],
              (instrs ST4i64_POST)>;
 
 

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s
index b4e7a4e40ba7721..8fe21167a5bd375 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s
@@ -1162,28 +1162,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeER..   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeER   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1193,43 +1193,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.5       <total>
 
 # CHECK:      [1] Code Region - G02
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeER..   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeER   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D====eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1239,43 +1239,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.5       <total>
 
 # CHECK:      [2] Code Region - G03
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeER..   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeER   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D====eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1285,43 +1285,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.5       <total>
 
 # CHECK:      [3] Code Region - G04
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1900
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.76
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    3.75
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 4.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=========eeeeeER  .    .  .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==================eeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     . D=======================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeER..   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1331,43 +1331,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.5       <total>
 
 # CHECK:      [4] Code Region - G05
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.80
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=========eeeeeER  .    .  .   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==================eeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     . D=======================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeER..   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1377,43 +1377,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.5       <total>
 
 # CHECK:      [5] Code Region - G06
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.80
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=========eeeeeER  .    .  .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==================eeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     . D=======================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeER..   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1423,43 +1423,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.5       <total>
 
 # CHECK:      [6] Code Region - G07
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        2300
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.82
-# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: uOps Per Cycle:    3.25
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    ..   ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    ..   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     D==========eER .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=========eeeeeeER .    .    ..   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     .D===============eER.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D==============eeeeeeER.    ..   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     . D====================eER    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D===================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     .  D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE---R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeeER .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     . D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D==eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .  D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1469,43 +1469,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.2    1.8       <total>
 
 # CHECK:      [7] Code Region - G08
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.83
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.30
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1515,43 +1515,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [8] Code Region - G09
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.83
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.30
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1561,43 +1561,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [9] Code Region - G10
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      856
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.90
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.15
+# CHECK-NEXT: IPC:               1.17
 # CHECK-NEXT: Block RThroughput: 8.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1607,43 +1607,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [10] Code Region - G11
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    2.98
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    ..   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eE----R.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeER  ..   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     . D===eE----R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeER ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D===eE----R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D=====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1653,43 +1653,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     6.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    2.0       <total>
 
 # CHECK:      [11] Code Region - G12
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    2.98
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     D=eE----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    ..   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeER  ..   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE----R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeER ..   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE----R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1699,43 +1699,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    2.0       <total>
 
 # CHECK:      [12] Code Region - G13
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3104
+# CHECK-NEXT: Total Cycles:      1209
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.90
-# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: uOps Per Cycle:    2.32
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 8.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D================eER   .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====================eER  .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .   D===========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    .    .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeER  .    .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE----R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeER .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE----R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=======eeeeeeeER   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .   D========eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1745,22 +1745,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     8.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     9.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.7    0.3    2.1       <total>
 
 # CHECK:      [13] Code Region - G14
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      3503
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
@@ -1769,19 +1769,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234567
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D========eE-----R   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    . .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .D==============eE-----R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    . .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D=====================eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     . D===========================eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1791,22 +1791,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     15.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     22.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     28.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.7   0.1    2.5       <total>
 
 # CHECK:      [14] Code Region - G15
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      3503
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
@@ -1815,19 +1815,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234567
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    . .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     D========eE-----R   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    . .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D==============eE-----R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    . .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D=====================eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D===========================eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1837,43 +1837,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     15.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     22.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     28.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     14.7   0.1    2.5       <total>
 
 # CHECK:      [15] Code Region - G16
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      1103
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.57
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.81
+# CHECK-NEXT: IPC:               0.91
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeER  .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeER .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeER.   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .D===eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeER   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     . D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1883,43 +1883,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.5       <total>
 
 # CHECK:      [16] Code Region - G17
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.57
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    3.93
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeER  .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeER .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeER.   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .D===eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeER   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1929,43 +1929,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.5       <total>
 
 # CHECK:      [17] Code Region - G18
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.57
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    3.93
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D====================eeeeeeeER    .  .   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==========================eeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     . D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeER  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeER .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeER.   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeER   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1975,43 +1975,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.5       <total>
 
 # CHECK:      [18] Code Region - G19
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.74
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    5.11
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeeeER   .    .    .    .  .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     D==============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=============eeeeeeeER .    .    .  .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .D====================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D===================eeeeeeeER    .  .   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     . D==========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D=========================eeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .  D================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeER  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeER .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeeeER.   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     . D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .  D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2021,43 +2021,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.1    2.5       <total>
 
 # CHECK:      [19] Code Region - G20
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    5.89
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     . D=eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeeeER.   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2067,43 +2067,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    2.5       <total>
 
 # CHECK:      [20] Code Region - G21
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    5.89
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeeeER.   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2113,43 +2113,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    2.5       <total>
 
 # CHECK:      [21] Code Region - G22
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      2909
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.03
+# CHECK-NEXT: IPC:               0.34
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234567
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .D=======eE-----R   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     . D=============eE-----R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D===================eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=========================eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2159,22 +2159,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     14.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     20.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     26.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.5   0.1    2.5       <total>
 
 # CHECK:      [22] Code Region - G23
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      3503
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
@@ -2183,19 +2183,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234567
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .D=======eE-----R   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D=============eE-----R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D===================eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .   D=========================eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2205,43 +2205,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     14.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     20.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     26.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.5   0.1    2.5       <total>
 
 # CHECK:      [23] Code Region - G24
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      2303
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.30
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .D=======eE-----R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D=============eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D============eeeeeeeER.   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .  D=============eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============eeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D=============eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2251,43 +2251,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 5.     1     14.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     14.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     13.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     14.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     9.9    0.1    2.5       <total>
 
 # CHECK:      [24] Code Region - G25
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    5.89
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     . D=eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeeeER.   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .  D=eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .   D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2297,43 +2297,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    2.5       <total>
 
 # CHECK:      [25] Code Region - G26
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      509
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    5.89
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.  .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     D=eE-----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeeeER.   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2343,43 +2343,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    2.5       <total>
 
 # CHECK:      [26] Code Region - G27
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total Cycles:      609
 # CHECK-NEXT: Total uOps:        3200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.89
-# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: uOps Per Cycle:    5.25
+# CHECK-NEXT: IPC:               1.64
 # CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .   .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .   .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .   .   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER   .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeeeER .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D=eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2389,43 +2389,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    2.6       <total>
 
 # CHECK:      [27] Code Region - G28
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    5.27
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2435,43 +2435,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [28] Code Region - G29
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    5.27
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2481,43 +2481,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [29] Code Region - G30
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total Cycles:      1909
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               0.52
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    ..   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeER .    ..   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .  D============================eER.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D===========================eeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .   D==================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    . .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .    .    . .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER .    .    . .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D========eeeeeeeER    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .  D=========eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==============eeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .   D===============eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2527,22 +2527,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 9.     1     35.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     10.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     16.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.1    0.2    2.8       <total>
 
 # CHECK:      [30] Code Region - G31
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      3503
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    8
@@ -2551,19 +2551,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234567
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .D=======eE-----R   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     . D=============eE-----R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .  D===================eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=========================eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2573,22 +2573,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     14.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     20.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     26.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.5   0.1    2.5       <total>
 
 # CHECK:      [31] Code Region - G32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      3503
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    8
@@ -2597,19 +2597,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT:                     0123456789          01234567
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .D=======eE-----R   .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D=============eE-----R .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .  D===================eE-----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=========================eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2619,43 +2619,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     14.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     20.0   0.0    5.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     26.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.5   0.1    2.5       <total>
 
 # CHECK:      [32] Code Region - G33
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      758
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    4.62
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE-----R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE-----R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeER  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     . D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .  D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .   D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2665,43 +2665,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.5       <total>
 
 # CHECK:      [33] Code Region - G34
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      758
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    4.62
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     D=eE-----R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .D=eE-----R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeER  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     . D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2711,43 +2711,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.5       <total>
 
 # CHECK:      [34] Code Region - G35
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      758
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    4.62
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeER  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2757,43 +2757,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.5       <total>
 
 # CHECK:      [35] Code Region - G36
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total Cycles:      960
 # CHECK-NEXT: Total uOps:        4500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.05
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    4.69
+# CHECK-NEXT: IPC:               1.04
 # CHECK-NEXT: Block RThroughput: 9.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .    .    ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     . D===============eER    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D==============eeeeeeeeER  .    .    .    ..   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     .   D=====================eER .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D====================eeeeeeeeER    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .D===========================eER   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . D==========================eeeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .  D===================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeeER .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     . DeE--------R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeeeeeeeER .   .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .   DeE------R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    DeeeeeeeeER   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .DeE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . DeeeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .  DeE--------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2803,43 +2803,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     27.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.1    0.4    3.3       <total>
 
 # CHECK:      [36] Code Region - G37
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.04
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    4.76
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeeeER    .    .    .    .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeeER    .    .    .   .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     .    D=======================eER   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D======================eeeeeeeeeeER    .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D===============================eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D==============================eeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeeER.  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .  DeE--------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeeER .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    DeE--------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeeeER   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE--------R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2849,43 +2849,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     38.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.6       <total>
 
 # CHECK:      [37] Code Region - G38
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total Cycles:      1011
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.04
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    4.75
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeeeER    .    .    .    .    .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeER .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D====================eeeeeeeeeeER .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=============================eER.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D============================eeeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeeER.    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE--------R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE------R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeeeER .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE--------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE--------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2895,22 +2895,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     38.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.6       <total>
 
 # CHECK:      [38] Code Region - G39
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        5000
 
 # CHECK:      Dispatch Width:    8
@@ -2920,18 +2920,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2941,22 +2941,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
 
 # CHECK:      [39] Code Region - G40
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        5000
 
 # CHECK:      Dispatch Width:    8
@@ -2966,18 +2966,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2987,43 +2987,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
 
 # CHECK:      [40] Code Region - G41
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      2103
 # CHECK-NEXT: Total uOps:        5000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    2.38
+# CHECK-NEXT: IPC:               0.48
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     .DeE------R    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D======eE------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=====eeeeeeeeER.  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    D=====eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D======eeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D======eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=====eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3033,43 +3033,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    2.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     7.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     6.0    0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.4    0.3    3.0       <total>
 
 # CHECK:      [41] Code Region - G42
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        5000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.96
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3079,43 +3079,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
 
 # CHECK:      [42] Code Region - G43
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        5000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.25
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.96
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3125,43 +3125,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
 
 # CHECK:      [43] Code Region - G44
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total Cycles:      808
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.12
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    4.70
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    . .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    . .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    . .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D===================eeeeeER   . .   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .    D========================eER  . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D=======================eeeeeER.   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .    .D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D=eeeeeE-R.   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    D==eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D==eeeeeER   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3171,43 +3171,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ldp	s1, s2, [x27], #248
-# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ldp	d1, d2, [x27], #496
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    1.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.5    2.6       <total>
 
 # CHECK:      [44] Code Region - G45
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2506
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.80
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .   ldp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=======eeeeeER.    .    .    .   ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3]     D============eER    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeER.    .    .   ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .D================eER    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D================eeeeeeeER   .   ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7]     .D=======================eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeER   ldp	w1, w2, [x27], #248
-# CHECK-NEXT: [0,9]     . D=======================eE--R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER. .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D=eE-----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeE-R. .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeER. .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE---R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeER   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .D===eE-----R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeE--R   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3217,43 +3217,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldp	w1, w2, [x27], #248
-# CHECK-NEXT: 9.     1     24.0   0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.7   0.1    0.2       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    1.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    2.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.4       <total>
 
 # CHECK:      [45] Code Region - G46
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.53
-# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    ..   ldp	x1, x2, [x27], #496
-# CHECK-NEXT: [0,1]     D=eE--R   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=eeeeER  .    ..   ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3]     D==eE--R  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeER .    ..   ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .D==eE--R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==eeeeeER    ..   ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: [0,7]     .D=======eER   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeeeeER.   ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: [0,9]     . D===========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   ..   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eE--R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  ..   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eE--R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER ..   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE--R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeER   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     . D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3269,37 +3269,37 @@ add x0, x27, 1
 # CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
 # CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.3    0.1    0.6       <total>
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.2       <total>
 
 # CHECK:      [46] Code Region - G47
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27], #254
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ldr	h1, [x27], #254
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ldr	s1, [x27], #254
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ldr	d1, [x27], #254
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ldr	q1, [x27], #254
-# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ldr	b1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ldr	h1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeER..   ldr	s1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ldr	d1, [x27], #254
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeER   ldr	q1, [x27], #254
+# CHECK-NEXT: [0,9]     .D====eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3309,43 +3309,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27], #254
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ldr	h1, [x27], #254
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ldr	s1, [x27], #254
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ldr	d1, [x27], #254
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ldr	q1, [x27], #254
-# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	h1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	s1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	d1, [x27], #254
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldr	q1, [x27], #254
+# CHECK-NEXT: 9.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.5       <total>
 
 # CHECK:      [47] Code Region - G48
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .  .   ldr	h1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D==========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==========eeeeeER  .    .  .   ldr	s1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D==============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==============eeeeeER  .  .   ldr	d1, [x27, #254]!
-# CHECK-NEXT: [0,7]     .D===================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D===================eeeeeER.   ldr	q1, [x27, #254]!
-# CHECK-NEXT: [0,9]     .D========================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeER ..   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeER..   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER.   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeER   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .D====eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3355,16 +3355,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ldr	h1, [x27, #254]!
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ldr	s1, [x27, #254]!
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ldr	d1, [x27, #254]!
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       ldr	q1, [x27, #254]!
-# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    1.5       <total>
 
 # CHECK:      [48] Code Region - G49
 
@@ -3508,28 +3508,27 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.41
-# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: uOps Per Cycle:    3.37
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeeeER   .   ldrsh	x1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsw	x1, [x27], #254
-# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==eeE-R .   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7]     .D====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D====eeER.   st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     . D=====eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER.   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeE-R   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eE-R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeER   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3545,37 +3544,36 @@ add x0, x27, 1
 # CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
 # CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     3.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     3.3    0.1    0.7       <total>
+# CHECK-NEXT: 7.     1     4.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.1    0.8       <total>
 
 # CHECK:      [52] Code Region - G53
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.99
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3585,43 +3583,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [53] Code Region - G54
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.99
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3631,43 +3628,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [54] Code Region - G55
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2100
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.09
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    4.17
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3677,43 +3673,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [55] Code Region - G56
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.69
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.36
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3723,43 +3718,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
 
 # CHECK:      [56] Code Region - G57
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.79
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.56
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3769,43 +3763,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
 
 # CHECK:      [57] Code Region - G58
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.79
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.56
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeER.   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3815,43 +3808,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    0.0       <total>
 
 # CHECK:      [58] Code Region - G59
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total Cycles:      755
 # CHECK-NEXT: Total uOps:        3700
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.17
-# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: uOps Per Cycle:    4.90
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     D===eER   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D==eeeeER.    .    .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .D======eER    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=====eeeER  .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     . D========eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=======eeeER    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .  D==========eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=========eeeeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D=============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE-R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  . .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eE--R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeER . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     . D==eE-R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeER. .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==eE-R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3861,43 +3854,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 9.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.3    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    0.7       <total>
 
 # CHECK:      [59] Code Region - G60
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total Cycles:      755
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.11
-# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: uOps Per Cycle:    5.03
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
-
-# CHECK:      [0,0]     DeeeER    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     D===eER   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D==eeeeER.    .    ..   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .D======eER    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=====eeeeER .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     . D=========eER.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D========eeeER   ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D===========eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==========eeeeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D==============eER   add	x0, x27, #1
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeER    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE-R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eE--R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER. .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE--R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeER. .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3907,43 +3900,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.8    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    0.8       <total>
 
 # CHECK:      [60] Code Region - G61
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total Cycles:      755
 # CHECK-NEXT: Total uOps:        3700
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.17
-# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: uOps Per Cycle:    4.90
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D===eER   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D==eeeER .    .    .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=====eER.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D====eeeeER  .    .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D========eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=======eeeER    .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D==========eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=========eeeeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    . .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeER   . .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-R   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER. .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE--R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeER. .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3953,43 +3946,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.1    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    0.7       <total>
 
 # CHECK:      [61] Code Region - G62
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.39
-# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: uOps Per Cycle:    5.11
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D===eeER .    .  .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .D=====eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D====eeeeeER .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     .  D========eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D========eeER  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D==========eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=========eeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .   D===========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeE-R   .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eE-R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeER   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .  D=eE---R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeE--R   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3999,43 +3992,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    1.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    2.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.2    0.3    1.1       <total>
 
 # CHECK:      [62] Code Region - G63
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1904
+# CHECK-NEXT: Total Cycles:      804
 # CHECK-NEXT: Total uOps:        4200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.21
-# CHECK-NEXT: IPC:               0.53
+# CHECK-NEXT: uOps Per Cycle:    5.22
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012
-
-# CHECK:      [0,0]     DeeeeeER  .    .    . .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     .D====eER .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeER.    .    . .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .D======eER    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=====eeeeeER.    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     .  D=========eER    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D========eeeeeER. .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .    D============eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    D============eeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    D==============eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .DeE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeE--R  ..   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eE--R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeER.   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .  D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   DeeeeeER   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    DeE---R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D==eeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4045,43 +4038,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 7.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     13.0   0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     8.4    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    2.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    2.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.4    1.3       <total>
 
 # CHECK:      [63] Code Region - G64
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total Cycles:      705
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.37
-# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: uOps Per Cycle:    5.39
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D====eER .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeER.    .   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D======eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=====eeER   .   .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=======eER  .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D======eeeeeER  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .   D==========eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==========eeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeE--R  ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE--R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER  ..   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeER   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .   D=eE---R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeE--R   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4091,43 +4084,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.4    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    2.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    2.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    1.4       <total>
 
 # CHECK:      [64] Code Region - G65
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total Cycles:      706
 # CHECK-NEXT: Total uOps:        3200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.45
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    4.53
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D====eER .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D===eeeeeER  .    .    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=======eER .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D=======eeeeER   .    .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5]     .  D===========eER  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D==========eeeeER    .   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7]     .   D==============eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==============eeeeER.   st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE---R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeER. .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE---R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D=eeeeER .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     .  D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D=eeeeER.   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     .   D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeER   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4137,43 +4130,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     11.0   0.0    0.0       st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 7.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     9.8    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.1    0.3    1.2       <total>
 
 # CHECK:      [65] Code Region - G66
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    3.95
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4183,43 +4176,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [66] Code Region - G67
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.05
-# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: uOps Per Cycle:    4.34
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeeER   .    .    .   .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D====eER  .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .   .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D========eER   .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .   .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     .D===========eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .   .   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .D===============eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     . D===================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   ..   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eE--R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  ..   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER ..   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE--R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER..   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE--R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     . D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4229,43 +4222,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.1       <total>
 
 # CHECK:      [67] Code Region - G68
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.09
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    4.73
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     D====eER  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .    .   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     D========eER   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeeER    .    .   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .D============eER   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D===========eeeeER.    .   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     . D===============eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D==============eeeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     .  D===================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   ..   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     D=eE--R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  ..   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE--R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeER..   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeER..   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     . D==eE--R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .  D==eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4275,43 +4268,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.4   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.1    1.2       <total>
 
 # CHECK:      [68] Code Region - G69
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2304
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.13
-# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: uOps Per Cycle:    5.13
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    ..   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    ..   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=========eER .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeER   .    ..   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D============eER  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D============eeeeER    ..   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     . D================eER   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D===============eeeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .  D====================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  ..   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER ..   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeER ..   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE--R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeER..   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eE--R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D==eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4321,43 +4314,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.1    1.3       <total>
 
 # CHECK:      [69] Code Region - G70
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.09
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    4.74
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D===eeeeeER   .    .    .   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D========eER  .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=======eeeeeER   .    .   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D============eER  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===========eeeeER    .   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .  D===============eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D===============eeeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     .  D===================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeER.   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeER.   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .  D=eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .  D==eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4367,43 +4360,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 9.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.4   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.1    1.2       <total>
 
 # CHECK:      [70] Code Region - G71
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    3.95
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4413,43 +4406,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [71] Code Region - G72
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    3.95
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4459,39 +4452,39 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [72] Code Region - G73
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total Cycles:      407
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.25
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    4.91
+# CHECK-NEXT: IPC:               1.47
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .   .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,1]     D======eER.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeER .   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3]     .D==========eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=========eeeeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,5]     . D==============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     . D==eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4501,39 +4494,39 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     8.3    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.3    1.7       <total>
 
 # CHECK:      [73] Code Region - G74
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total Cycles:      708
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.31
-# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: uOps Per Cycle:    5.37
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .D==========eER.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=========eeeeeeER.    .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     . D===============eER    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==============eeeeeeER    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .  D====================eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D===================eeeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE---R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4543,43 +4536,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.9       <total>
 
 # CHECK:      [74] Code Region - G75
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.26
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    4.81
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeeER .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D==============eER.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=============eeeeeER .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D==================eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=================eeeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=======================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  .  .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeER  .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER  .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4589,43 +4582,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.2    1.7       <total>
 
 # CHECK:      [75] Code Region - G76
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2204
+# CHECK-NEXT: Total Cycles:      755
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.81
-# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: uOps Per Cycle:    5.30
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeER   .    .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     . D============eER  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===========eeeeER    .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D===============eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==============eeeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .   D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeE-R . .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .D=eE---R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER. .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     . D==eE--R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeER .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4635,43 +4628,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    1.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.4       <total>
 
 # CHECK:      [76] Code Region - G77
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      755
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    5.30
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D===eeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .D=======eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D======eeeeER.    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D==========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=========eeeeER .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============eeeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .   D================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   . .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D=eE--R   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  . .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .D=eE--R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER. .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE--R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeER .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .   D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4681,43 +4674,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     10.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     13.0   0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 9.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     9.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.0       <total>
 
 # CHECK:      [77] Code Region - G78
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total Cycles:      808
 # CHECK-NEXT: Total uOps:        4200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.55
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    5.20
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D===eeeeeER   .    .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .D========eER  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=======eeeeeER   .    .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D============eER  .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===========eeeeeeER  .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .   D================eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    D===============eeeeeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .    D======================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE---R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeER    .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE---R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .   D=eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D=eeeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4727,43 +4720,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 7.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     16.0   0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 9.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.0    0.3    1.7       <total>
 
 # CHECK:      [78] Code Region - G79
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total Cycles:      1207
 # CHECK-NEXT: Total uOps:        5800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.41
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    4.81
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeeeER .    .    .    .    .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     . D==============eER.    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D=============eeeeeeeER    .    .    .   .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .  D====================eER   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D===================eeeeeeeeeER.    .   .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    D===========================eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D==========================eeeeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    . D==================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeER.    .  .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE-----R.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .  .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     . DeE-------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeeeeeeER  .  .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .  D=eE-----R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D==eeeeeeeeeER.   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    D==eE-------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=eeeeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    . D=eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4773,43 +4766,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     20.0   0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     27.0   0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 9.     1     35.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    2.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.4    3.1       <total>
 
 # CHECK:      [79] Code Region - G80
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total Cycles:      1007
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.33
-# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: uOps Per Cycle:    4.77
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 9.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    .   .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     . D===========eER   .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D==========eeeeeeeER  .    .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .  D=================eER .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D================eeeeeeeeeER   .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    D========================eER  .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D=======================eeeeeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .D==============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=eE-----R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  D=eeeeeeeER ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .  D==eE-----R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D=eeeeeeeeeER   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    D=eE-------R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=eeeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4819,43 +4812,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.4    2.6       <total>
 
 # CHECK:      [80] Code Region - G81
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total Cycles:      1057
 # CHECK-NEXT: Total uOps:        5200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.57
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    4.92
+# CHECK-NEXT: IPC:               0.95
 # CHECK-NEXT: Block RThroughput: 10.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    ..   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===============eER   .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==============eeeeeER    .    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .   D===================eER   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D==================eeeeeER    ..   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    D=======================eER   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D======================eeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .D===========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeeER   ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER ..   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE-------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeER ..   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .   D===eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D===eeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    D====eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D===eeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .D====eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4865,43 +4858,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.6    2.3       <total>
 
 # CHECK:      [81] Code Region - G82
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      756
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    5.29
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D================eeeeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D=eE---R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeER  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     . D==eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeER .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4911,41 +4904,41 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     17.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.5       <total>
 
 # CHECK:      [82] Code Region - G83
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total Cycles:      605
 # CHECK-NEXT: Total uOps:        3200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    1.77
-# CHECK-NEXT: IPC:               0.44
+# CHECK-NEXT: uOps Per Cycle:    5.29
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     D=====eER .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    ..   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D=========eER .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeER   ..   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     . D============eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===========eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D===============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D=eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     . D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4955,34 +4948,34 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 5.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     9.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.0    0.3    1.3       <total>
 
 # CHECK:      [83] Code Region - G84
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      404
+# CHECK-NEXT: Total Cycles:      204
 # CHECK-NEXT: Total uOps:        1000
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.48
-# CHECK-NEXT: IPC:               0.99
+# CHECK-NEXT: uOps Per Cycle:    4.90
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 1.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     01234567
+# CHECK-NEXT: Index     012345
 
-# CHECK:      [0,0]     DeeER. .   stp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,1]     D==eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER.   stp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,3]     .D===eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.   stp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,1]     D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   stp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,3]     .D=eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4992,37 +4985,36 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	s1, s2, [x27], #248
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       stp	d1, d2, [x27], #496
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.5    0.3    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       stp	d1, d2, [x27], #496
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.3    0.0       <total>
 
 # CHECK:      [84] Code Region - G85
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1104
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.81
-# CHECK-NEXT: IPC:               0.91
+# CHECK-NEXT: uOps Per Cycle:    4.41
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .   .   stp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     D===eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D==eeER  .   .   stp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .D====eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D===eeER.   .   stp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5]     . D=====eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D====eeeER .   stp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7]     .  D=======eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D======eER.   stp	w1, w2, [x27], #248
-# CHECK-NEXT: [0,9]     .   D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER   .   stp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D=eE-R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeER   stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .  D==eE-R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eE-R   stp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5032,43 +5024,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       stp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       stp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       stp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       stp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       stp	w1, w2, [x27], #248
-# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.1    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       stp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       stp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       stp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    1.0       stp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    0.3       <total>
 
 # CHECK:      [85] Code Region - G86
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2300
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    3.27
-# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: uOps Per Cycle:    4.56
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeER .    .   stp	x1, x2, [x27], #496
-# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.    .   stp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .D=eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeER    .   stp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5]     . D=eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D=eeER  .   str	b1, [x27], #254
-# CHECK-NEXT: [0,7]     .  D==eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D==eeER.   str	h1, [x27], #254
-# CHECK-NEXT: [0,9]     .  D====eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeER .  .   stp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.  .   stp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeER  .   stp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     . D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeER.   str	b1, [x27], #254
+# CHECK-NEXT: [0,7]     .  D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeER   str	h1, [x27], #254
+# CHECK-NEXT: [0,9]     .  D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5084,37 +5075,36 @@ add x0, x27, 1
 # CHECK-NEXT: 4.     1     1.0    0.0    0.0       stp	x1, x2, [x27, #496]!
 # CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     2.0    0.0    0.0       str	b1, [x27], #254
-# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27], #254
-# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.2    0.1    0.0       <total>
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       str	h1, [x27], #254
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.1    0.0       <total>
 
 # CHECK:      [86] Code Region - G87
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.19
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    4.37
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   str	s1, [x27], #254
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   str	d1, [x27], #254
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   str	q1, [x27], #254
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D====eeER  .   str	b1, [x27, #254]!
-# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   str	h1, [x27, #254]!
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   str	s1, [x27], #254
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   str	d1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   str	q1, [x27], #254
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeER.   str	b1, [x27, #254]!
+# CHECK-NEXT: [0,7]     . D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   str	h1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5124,43 +5114,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27], #254
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	d1, [x27], #254
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	q1, [x27], #254
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       str	b1, [x27, #254]!
-# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       str	h1, [x27, #254]!
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	d1, [x27], #254
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	q1, [x27], #254
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       str	b1, [x27, #254]!
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	h1, [x27, #254]!
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.1    0.0       <total>
 
 # CHECK:      [87] Code Region - G88
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      804
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    8
-# CHECK-NEXT: uOps Per Cycle:    2.74
-# CHECK-NEXT: IPC:               1.24
+# CHECK-NEXT: uOps Per Cycle:    4.37
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    ..   str	s1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D==eER    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   ..   str	d1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D====eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER ..   str	q1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D=====eER..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D====eER..   str	w1, [x27], #254
-# CHECK-NEXT: [0,7]     . D=====eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D=====eER.   str	x1, [x27], #254
-# CHECK-NEXT: [0,9]     . D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   str	s1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   str	d1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   str	q1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eER .   str	w1, [x27], #254
+# CHECK-NEXT: [0,7]     . D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eER.   str	x1, [x27], #254
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -5170,16 +5159,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       str	s1, [x27, #254]!
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       str	d1, [x27, #254]!
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       str	q1, [x27, #254]!
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       str	w1, [x27], #254
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     6.0    0.0    0.0       str	x1, [x27], #254
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.6    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       str	d1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       str	q1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       str	w1, [x27], #254
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       str	x1, [x27], #254
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.1    0.0       <total>
 
 # CHECK:      [88] Code Region - G89
 


        


More information about the llvm-commits mailing list