[llvm] 3b13e02 - [AArch64] Fix postinc operands for Cortex-A57 scheduling

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 12 02:05:52 PDT 2023


Author: David Green
Date: 2023-10-12T10:05:45+01:00
New Revision: 3b13e02d6d3248050d29fc73b22705b0ffda0f48

URL: https://github.com/llvm/llvm-project/commit/3b13e02d6d3248050d29fc73b22705b0ffda0f48
DIFF: https://github.com/llvm/llvm-project/commit/3b13e02d6d3248050d29fc73b22705b0ffda0f48.diff

LOG: [AArch64] Fix postinc operands for Cortex-A57 scheduling

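Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc
loads/stores in the Cortex-A57 scheduling model. The address writeback is the
first def operand of these instructions, so WriteAdr has to be listed first in
the InstRW; with the old order, users of the updated base register were
modelled as waiting for the full load/store latency (see the llvm-mca test
changes below).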

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64SchedA57.td
    llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 8ce229374000054..277ec772cf0f10e 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -183,165 +183,165 @@ def : InstRW<[A57Write_3cyc_1W], (instregex "^CRC32")>;
 // -----------------------------------------------------------------------------
 
 def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1i(8|16|32)$")>;
-def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_1V], (instregex "LD1i(8|16|32)_POST$")>;
 def : InstRW<[A57Write_5cyc_1L],            (instregex "LD1i(64)$")>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr],  (instregex "LD1i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L],  (instregex "LD1i(64)_POST$")>;
 
 def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1Rv(8b|4h|2s)$")>;
-def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_5cyc_1L],            (instregex "LD1Rv(1d)$")>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr],  (instregex "LD1Rv(1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L],  (instregex "LD1Rv(1d)_POST$")>;
 def : InstRW<[A57Write_8cyc_1L_1V],           (instregex "LD1Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L],    (instregex "LD1Onev(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L],    (instregex "LD1Onev(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[A57Write_5cyc_1L],              (instregex "LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr],    (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L],    (instregex "LD1Twov(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],   (instregex "LD1Twov(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],   (instregex "LD1Threev(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_7cyc_3L],            (instregex "LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_7cyc_3L, WriteAdr],  (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_7cyc_3L],  (instregex "LD1Threev(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],             (instregex "LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],   (instregex "LD1Fourv(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_8cyc_4L],           (instregex "LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_8cyc_4L, WriteAdr], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_4L], (instregex "LD1Fourv(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A57Write_8cyc_1L_2V],           (instregex "LD2i(8|16)$")>;
-def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_2V], (instregex "LD2i(8|16)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],            (instregex "LD2i(32)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],  (instregex "LD2i(32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],  (instregex "LD2i(32)_POST$")>;
 def : InstRW<[A57Write_8cyc_1L_1V],            (instregex "LD2i(64)$")>;
-def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],  (instregex "LD2i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_1V],  (instregex "LD2i(64)_POST$")>;
 
 def : InstRW<[A57Write_8cyc_1L_1V],            (instregex "LD2Rv(8b|4h|2s)$")>;
-def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],  (instregex "LD2Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_1V],  (instregex "LD2Rv(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_5cyc_1L],             (instregex "LD2Rv(1d)$")>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr],   (instregex "LD2Rv(1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L],   (instregex "LD2Rv(1d)_POST$")>;
 def : InstRW<[A57Write_8cyc_1L_2V],           (instregex "LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A57Write_8cyc_1L_1V],             (instregex "LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[A57Write_8cyc_1L_1V, WriteAdr],   (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_1V],   (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_9cyc_2L_2V],           (instregex "LD2Twov(16b|8h|4s)$")>;
-def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],             (instregex "LD2Twov(2d)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD2Twov(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],   (instregex "LD2Twov(2d)_POST$")>;
 
 def : InstRW<[A57Write_9cyc_1L_3V],           (instregex "LD3i(8|16)$")>;
-def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr], (instregex "LD3i(8|16)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_1L_3V], (instregex "LD3i(8|16)_POST$")>;
 def : InstRW<[A57Write_8cyc_1L_2V],            (instregex "LD3i(32)$")>;
-def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],  (instregex "LD3i(32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_2V],  (instregex "LD3i(32)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],             (instregex "LD3i(64)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],   (instregex "LD3i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],   (instregex "LD3i(64)_POST$")>;
 
 def : InstRW<[A57Write_8cyc_1L_2V],             (instregex "LD3Rv(8b|4h|2s)$")>;
-def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],   (instregex "LD3Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_2V],   (instregex "LD3Rv(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],              (instregex "LD3Rv(1d)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],    (instregex "LD3Rv(1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],    (instregex "LD3Rv(1d)_POST$")>;
 def : InstRW<[A57Write_9cyc_1L_3V],            (instregex "LD3Rv(16b|8h|4s)$")>;
-def : InstRW<[A57Write_9cyc_1L_3V, WriteAdr],  (instregex "LD3Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_1L_3V],  (instregex "LD3Rv(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD3Rv(2d)$")>;
-def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD3Rv(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_3V], (instregex "LD3Rv(2d)_POST$")>;
 
 def : InstRW<[A57Write_9cyc_2L_2V],               (instregex "LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr],     (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_2V],     (instregex "LD3Threev(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_10cyc_3L_4V],           (instregex "LD3Threev(16b|8h|4s)$")>;
-def : InstRW<[A57Write_10cyc_3L_4V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_10cyc_3L_4V], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_8cyc_4L],               (instregex "LD3Threev(2d)$")>;
-def : InstRW<[A57Write_8cyc_4L, WriteAdr],     (instregex "LD3Threev(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_4L],     (instregex "LD3Threev(2d)_POST$")>;
 
 def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD4i(8|16)$")>;
-def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(8|16)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_3V], (instregex "LD4i(8|16)_POST$")>;
 def : InstRW<[A57Write_8cyc_1L_2V],             (instregex "LD4i(32)$")>;
-def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],   (instregex "LD4i(32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_2V],   (instregex "LD4i(32)_POST$")>;
 def : InstRW<[A57Write_9cyc_2L_3V],           (instregex "LD4i(64)$")>;
-def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr], (instregex "LD4i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_3V], (instregex "LD4i(64)_POST$")>;
 
 def : InstRW<[A57Write_8cyc_1L_2V],              (instregex "LD4Rv(8b|4h|2s)$")>;
-def : InstRW<[A57Write_8cyc_1L_2V, WriteAdr],    (instregex "LD4Rv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_1L_2V],    (instregex "LD4Rv(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_6cyc_2L],               (instregex "LD4Rv(1d)$")>;
-def : InstRW<[A57Write_6cyc_2L, WriteAdr],     (instregex "LD4Rv(1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L],     (instregex "LD4Rv(1d)_POST$")>;
 def : InstRW<[A57Write_9cyc_2L_3V],            (instregex "LD4Rv(16b|8h|4s)$")>;
-def : InstRW<[A57Write_9cyc_2L_3V, WriteAdr],  (instregex "LD4Rv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_3V],  (instregex "LD4Rv(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_9cyc_2L_4V],           (instregex "LD4Rv(2d)$")>;
-def : InstRW<[A57Write_9cyc_2L_4V, WriteAdr], (instregex "LD4Rv(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_4V], (instregex "LD4Rv(2d)_POST$")>;
 
 def : InstRW<[A57Write_9cyc_2L_2V],                (instregex "LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[A57Write_9cyc_2L_2V, WriteAdr],      (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_9cyc_2L_2V],      (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_11cyc_4L_4V],           (instregex "LD4Fourv(16b|8h|4s)$")>;
-def : InstRW<[A57Write_11cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_11cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_8cyc_4L],                (instregex "LD4Fourv(2d)$")>;
-def : InstRW<[A57Write_8cyc_4L, WriteAdr],      (instregex "LD4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_4L],      (instregex "LD4Fourv(2d)_POST$")>;
 
 // Vector Store
 // -----------------------------------------------------------------------------
 
 def : InstRW<[A57Write_1cyc_1S],            (instregex "ST1i(8|16|32)$")>;
-def : InstRW<[A57Write_1cyc_1S, WriteAdr],  (instregex "ST1i(8|16|32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1S],  (instregex "ST1i(8|16|32)_POST$")>;
 def : InstRW<[A57Write_3cyc_1S_1V],           (instregex "ST1i(64)$")>;
-def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST1i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_1S_1V], (instregex "ST1i(64)_POST$")>;
 
 def : InstRW<[A57Write_1cyc_1S],                  (instregex "ST1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_1cyc_1S, WriteAdr],        (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_1cyc_1S],        (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_2cyc_2S],                 (instregex "ST1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_2cyc_2S, WriteAdr],       (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_2cyc_2S],       (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[A57Write_2cyc_2S],                 (instregex "ST1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_2cyc_2S, WriteAdr],       (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_2cyc_2S],       (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_4cyc_4S],               (instregex "ST1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_4cyc_4S, WriteAdr],     (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_4cyc_4S],     (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[A57Write_3cyc_3S],                (instregex "ST1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_3cyc_3S, WriteAdr],      (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_3S],      (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_6cyc_6S],             (instregex "ST1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_6cyc_6S, WriteAdr],   (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_6S],   (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
 def : InstRW<[A57Write_4cyc_4S],               (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[A57Write_4cyc_4S, WriteAdr],     (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_4cyc_4S],     (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
 def : InstRW<[A57Write_8cyc_8S],           (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[A57Write_8cyc_8S, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_8S], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
 
 def : InstRW<[A57Write_3cyc_1S_1V],           (instregex "ST2i(8|16|32)$")>;
-def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr], (instregex "ST2i(8|16|32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_1S_1V], (instregex "ST2i(8|16|32)_POST$")>;
 def : InstRW<[A57Write_2cyc_2S],           (instregex "ST2i(64)$")>;
-def : InstRW<[A57Write_2cyc_2S, WriteAdr], (instregex "ST2i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_2cyc_2S], (instregex "ST2i(64)_POST$")>;
 
 def : InstRW<[A57Write_3cyc_2S_1V],              (instregex "ST2Twov(8b|4h|2s)$")>;
-def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr],    (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_2S_1V],    (instregex "ST2Twov(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_4cyc_4S_2V],           (instregex "ST2Twov(16b|8h|4s)$")>;
-def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_4cyc_4S_2V], (instregex "ST2Twov(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_4cyc_4S],             (instregex "ST2Twov(2d)$")>;
-def : InstRW<[A57Write_4cyc_4S, WriteAdr],   (instregex "ST2Twov(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_4cyc_4S],   (instregex "ST2Twov(2d)_POST$")>;
 
 def : InstRW<[A57Write_3cyc_1S_1V],            (instregex "ST3i(8|16)$")>;
-def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr],  (instregex "ST3i(8|16)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_1S_1V],  (instregex "ST3i(8|16)_POST$")>;
 def : InstRW<[A57Write_3cyc_3S],           (instregex "ST3i(32)$")>;
-def : InstRW<[A57Write_3cyc_3S, WriteAdr], (instregex "ST3i(32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_3S], (instregex "ST3i(32)_POST$")>;
 def : InstRW<[A57Write_3cyc_2S_1V],           (instregex "ST3i(64)$")>;
-def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr], (instregex "ST3i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_2S_1V], (instregex "ST3i(64)_POST$")>;
 
 def : InstRW<[A57Write_3cyc_3S_2V],                 (instregex "ST3Threev(8b|4h|2s)$")>;
-def : InstRW<[A57Write_3cyc_3S_2V, WriteAdr],       (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_3S_2V],       (instregex "ST3Threev(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_6cyc_6S_4V],           (instregex "ST3Threev(16b|8h|4s)$")>;
-def : InstRW<[A57Write_6cyc_6S_4V, WriteAdr], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_6S_4V], (instregex "ST3Threev(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_6cyc_6S],                (instregex "ST3Threev(2d)$")>;
-def : InstRW<[A57Write_6cyc_6S, WriteAdr],      (instregex "ST3Threev(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_6cyc_6S],      (instregex "ST3Threev(2d)_POST$")>;
 
 def : InstRW<[A57Write_3cyc_1S_1V],             (instregex "ST4i(8|16)$")>;
-def : InstRW<[A57Write_3cyc_1S_1V, WriteAdr],   (instregex "ST4i(8|16)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_1S_1V],   (instregex "ST4i(8|16)_POST$")>;
 def : InstRW<[A57Write_4cyc_4S],           (instregex "ST4i(32)$")>;
-def : InstRW<[A57Write_4cyc_4S, WriteAdr], (instregex "ST4i(32)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_4cyc_4S], (instregex "ST4i(32)_POST$")>;
 def : InstRW<[A57Write_3cyc_2S_1V],            (instregex "ST4i(64)$")>;
-def : InstRW<[A57Write_3cyc_2S_1V, WriteAdr],  (instregex "ST4i(64)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_3cyc_2S_1V],  (instregex "ST4i(64)_POST$")>;
 
 def : InstRW<[A57Write_4cyc_4S_2V],                  (instregex "ST4Fourv(8b|4h|2s)$")>;
-def : InstRW<[A57Write_4cyc_4S_2V, WriteAdr],        (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_4cyc_4S_2V],        (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
 def : InstRW<[A57Write_8cyc_8S_4V],           (instregex "ST4Fourv(16b|8h|4s)$")>;
-def : InstRW<[A57Write_8cyc_8S_4V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_8S_4V], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
 def : InstRW<[A57Write_8cyc_8S],                (instregex "ST4Fourv(2d)$")>;
-def : InstRW<[A57Write_8cyc_8S, WriteAdr],      (instregex "ST4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, A57Write_8cyc_8S],      (instregex "ST4Fourv(2d)_POST$")>;
 
 // Vector - Integer
 // -----------------------------------------------------------------------------
@@ -592,38 +592,38 @@ def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPDi)>;
 def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDNPQi)>;
 def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDNPSi)>;
 def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPDi)>;
-def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPDpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L, WriteLDHi], (instrs LDPDpost)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L, WriteLDHi], (instrs LDPDpre)>;
 def : InstRW<[A57Write_6cyc_2L, WriteLDHi], (instrs LDPQi)>;
-def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpost)>;
-def : InstRW<[A57Write_6cyc_2L, WriteLDHi, WriteAdr], (instrs LDPQpre)>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L, WriteLDHi], (instrs LDPQpost)>;
+def : InstRW<[WriteAdr, A57Write_6cyc_2L, WriteLDHi], (instrs LDPQpre)>;
 def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWi)>;
-def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpost)>;
-def : InstRW<[A57Write_5cyc_1I_2L, WriteLDHi, WriteAdr], (instrs LDPSWpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWpost)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1I_2L, WriteLDHi], (instrs LDPSWpre)>;
 def : InstRW<[A57Write_5cyc_1L, WriteLDHi], (instrs LDPSi)>;
-def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteLDHi, WriteAdr], (instrs LDPSpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L, WriteLDHi], (instrs LDPSpost)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L, WriteLDHi], (instrs LDPSpre)>;
 def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRBpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRBpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L], (instrs LDRBpre)>;
 def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroW)>;
 def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRBroX)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRBui)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRDl)>;
 def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRDpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRDpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L], (instrs LDRDpre)>;
 def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroW)>;
 def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRDroX)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRDui)>;
 def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroW)>;
 def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRHHroX)>;
 def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRHpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRHpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L], (instrs LDRHpre)>;
 def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroW)>;
 def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRHroX)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRHui)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRQl)>;
 def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRQpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRQpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L], (instrs LDRQpre)>;
 def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroW)>;
 def : InstRW<[A57Write_6cyc_1I_1L, ReadAdrBase], (instrs LDRQroX)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRQui)>;
@@ -633,7 +633,7 @@ def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroW)>;
 def : InstRW<[A57Write_5cyc_1I_1L, ReadAdrBase], (instrs LDRSHXroX)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRSl)>;
 def : InstRW<[A57Write_5cyc_1L, WriteI], (instrs LDRSpost)>;
-def : InstRW<[A57Write_5cyc_1L, WriteAdr], (instrs LDRSpre)>;
+def : InstRW<[WriteAdr, A57Write_5cyc_1L], (instrs LDRSpre)>;
 def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroW)>;
 def : InstRW<[A57Write_5cyc_1L, ReadAdrBase], (instrs LDRSroX)>;
 def : InstRW<[A57Write_5cyc_1L], (instrs LDRSui)>;

diff  --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s
index 6993401ffa259a2..5248392188f0322 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A57-writeback.s
@@ -1162,28 +1162,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER ..   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeER..   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     . D=eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER.   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeER   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1193,43 +1193,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.5       <total>
 
 # CHECK:      [1] Code Region - G02
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER ..   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeER..   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     . D=eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER.   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeER   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1239,43 +1239,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.5       <total>
 
 # CHECK:      [2] Code Region - G03
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  ..   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER ..   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeER..   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER.   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeER   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1285,43 +1285,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.5       <total>
 
 # CHECK:      [3] Code Region - G04
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2604
+# CHECK-NEXT: Total Cycles:      607
 # CHECK-NEXT: Total uOps:        1600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.61
-# CHECK-NEXT: IPC:               0.38
+# CHECK-NEXT: uOps Per Cycle:    2.64
+# CHECK-NEXT: IPC:               1.65
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .   .   ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .   .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeeER .    .   .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .  D=============eER.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=============eeeeeER .   .   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .   D=================eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=================eeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  . .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER . .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE---R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeER .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .  DeE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .   D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1331,43 +1331,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     14.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.6    0.2    1.6       <total>
 
 # CHECK:      [4] Code Region - G05
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total Cycles:      807
 # CHECK-NEXT: Total uOps:        1800
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.64
-# CHECK-NEXT: IPC:               0.36
+# CHECK-NEXT: uOps Per Cycle:    2.23
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    ..   ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    ..   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     . D=========eER.    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D========eeeeeeER.    .    ..   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .   D=============eER    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D============eeeeeeER    ..   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .D=================eER   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D=================eeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    . D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeeeeER.   .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     . D=eE---R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeeeeeER  .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .   DeE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    DeeeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .DeE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=eeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    . D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1377,43 +1377,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.4    0.4    1.8       <total>
 
 # CHECK:      [5] Code Region - G06
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.63
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    2.40
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     . D=========eER.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=========eeeeeER .    .    .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .  D=============eER.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D============eeeeeeER.    .   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    D=================eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    D=================eeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .  .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eeeeeER.  .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=eE---R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeER  .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .  D=eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   DeeeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    DeE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D=eeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1423,43 +1423,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.6    0.3    1.7       <total>
 
 # CHECK:      [6] Code Region - G07
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3104
+# CHECK-NEXT: Total Cycles:      1107
 # CHECK-NEXT: Total uOps:        2100
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.32
+# CHECK-NEXT: uOps Per Cycle:    1.90
+# CHECK-NEXT: IPC:               0.90
 # CHECK-NEXT: Block RThroughput: 11.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   .   ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeER.    .    .    .   .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=========eER    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D========eeeeeeER    .    .   .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     .    D=============eER   .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D============eeeeeeeER  .   .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     .    . D==================eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     .    .   D======================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    . .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeER    . .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeER  . .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .    DeE----R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeER .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .    . DeE-----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     .    .   D=eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1469,43 +1469,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 9.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    2.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.2    0.6    2.1       <total>
 
 # CHECK:      [7] Code Region - G08
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total Cycles:      1307
 # CHECK-NEXT: Total uOps:        2300
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.70
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    1.76
+# CHECK-NEXT: IPC:               0.77
 # CHECK-NEXT: Block RThroughput: 13.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    ..   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    ..   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     .  D==========eER   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=========eeeeeeER   .    .    ..   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     .    D==============eER  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D=============eeeeeeeER .    ..   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .    . D===================eER.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D==================eeeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .   D========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .   .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeE----R .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   .   .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     .  DeE-----R   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=eeeeeeER .   .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .    D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=eeeeeeeER  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .    . D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==eeeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .   D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1515,43 +1515,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     14.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 9.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    2.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    2.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.7    2.3       <total>
 
 # CHECK:      [8] Code Region - G09
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total Cycles:      1207
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.69
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    1.82
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D==========eER   .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=========eeeeeeER   .    .    .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D==============eER  .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D=============eeeeeeER  .    .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D==================eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=================eeeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=======================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE-----R   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=eeeeeeER .  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D=eE----R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=eeeeeeER  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D=eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=eeeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1561,43 +1561,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     14.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    2.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.6    0.6    2.2       <total>
 
 # CHECK:      [9] Code Region - G10
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total Cycles:      1407
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.71
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.71
+# CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    . .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeER    .    .    .    . .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D==========eER   .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=========eeeeeeeER  .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D===============eER .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==============eeeeeeER .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D===================eER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D==================eeeeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .   D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE-----R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=eeeeeeeER.    .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D=eE-----R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeeeER   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D==eE----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==eeeeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .   D==eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1607,43 +1607,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.9   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    2.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    2.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.0    0.7    2.4       <total>
 
 # CHECK:      [10] Code Region - G11
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total Cycles:      1407
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.71
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.71
+# CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeER.    .    .    .    . .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .  D=========eER    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D========eeeeeeeeER  .    .    . .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     .    D===============eER .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==============eeeeeeER .    . .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D===================eER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D==================eeeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .   D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE----R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeER    .    .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .  DeE----R    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.    .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     .    DeE------R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeeeER   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D==eE----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==eeeeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .   D==eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1653,43 +1653,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 7.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     19.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    3.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.7    2.4       <total>
 
 # CHECK:      [11] Code Region - G12
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total Cycles:      1407
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.71
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.71
+# CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    . .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeER   .    .    .    . .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER.    .    . .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D=================eER    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D================eeeeeeER    . .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=====================eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D====================eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .    .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     .DeE------R    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeeeeeER  .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==eE----R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeeeER   .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D==eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====eeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D====eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1699,43 +1699,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    1.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    3.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.4    0.9    2.4       <total>
 
 # CHECK:      [12] Code Region - G13
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total Cycles:      1910
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    1.36
+# CHECK-NEXT: IPC:               0.52
 # CHECK-NEXT: Block RThroughput: 15.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeER   .    .    .    .    ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER.    .    .    ..   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D=================eER    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D================eeeeeeeeER  .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=======================eER .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D======================eeeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .    .   D=============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeeeeeER  .    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D==eE----R  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeeeER   .    .  .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D==eE------R   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====eeeeeeeeER    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D====eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==========eeeeeeeeER   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .    .   D==========eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1745,22 +1745,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    1.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    3.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     5.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     11.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     11.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.6    0.8    2.8       <total>
 
 # CHECK:      [13] Code Region - G14
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
@@ -1770,18 +1770,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    . .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    . .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    . D==================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     .    .   D========================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1791,22 +1791,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    3.0       <total>
 
 # CHECK:      [14] Code Region - G15
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total Cycles:      3703
 # CHECK-NEXT: Total uOps:        1900
 
 # CHECK:      Dispatch Width:    3
@@ -1816,18 +1816,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . D=========================eeeeeER.   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .  D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .   .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .   .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    D============eE------R   .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.   .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==================eE------R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D=========================eeeeeER   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .  D=========================eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1837,43 +1837,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.4   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     26.0   0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.2   0.1    2.7       <total>
 
 # CHECK:      [15] Code Region - G16
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3404
+# CHECK-NEXT: Total Cycles:      1205
 # CHECK-NEXT: Total uOps:        1800
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.53
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    1.49
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01234567
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    . .   ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .    . .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeeeeER    .    .    . .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     .  D===============eER   .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D==============eeeeeeeeER .    . .   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .    D=====================eER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D====================eeeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     .    . D===========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  .    ..   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .    ..   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     .D=eE---R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeeER  ..   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .  DeE------R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   DeeeeeeeeER..   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .    DeE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .DeeeeeeeeER   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     .    . DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1883,43 +1883,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.2    0.3    2.4       <total>
 
 # CHECK:      [16] Code Region - G17
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total Cycles:      908
 # CHECK-NEXT: Total uOps:        1900
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.51
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    2.09
+# CHECK-NEXT: IPC:               1.10
 # CHECK-NEXT: Block RThroughput: 6.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . D=========================eeeeeER.   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .  D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     .DeE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  ..   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     .  DeE------R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER..   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .    DeE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .    . DeE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . DeeeeeE--R   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .  DeE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1929,43 +1929,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.4   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    2.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.4    3.1       <total>
 
 # CHECK:      [17] Code Region - G18
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    1.98
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1975,43 +1975,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
 
 # CHECK:      [18] Code Region - G19
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.53
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    1.98
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    ..   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeER.    .    .    ..   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    D=================eER    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D================eeeeeeeeER  .    ..   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .    . D=======================eER .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D======================eeeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .   D=============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeER  .  .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    DeE----R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2021,43 +2021,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    2.8       <total>
 
 # CHECK:      [19] Code Region - G20
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.63
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    2.58
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .   .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeeER .    .    .   .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D====================eeeeeeeeeER  .   .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D============================eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D===========================eeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   . .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE-------R   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  . .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .  DeE------R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeER .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .    DeE-------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeeER   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . DeE-------R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeE-R   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2067,43 +2067,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    1.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.3       <total>
 
 # CHECK:      [20] Code Region - G21
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total Cycles:      1010
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.57
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    2.38
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .    .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeeER  .    .    .    .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D====================eER .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .    .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D==========================eER   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeER   .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE-------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER  .   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2113,43 +2113,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.2       <total>
 
 # CHECK:      [21] Code Region - G22
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total Cycles:      3410
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.63
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    0.76
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 8.7
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .   .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeER  .    .    .   .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     .    D====================eER .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .   .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D==========================eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .  D=======eE------R.    .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeER  .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .    D=============eE------R  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D===================eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=========================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2159,22 +2159,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     14.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     20.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     33.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.4   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     26.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.8   0.1    3.1       <total>
 
 # CHECK:      [22] Code Region - G23
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3804
+# CHECK-NEXT: Total Cycles:      3803
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
@@ -2184,18 +2184,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    ..   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    ..   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    ..   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    ..   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .   D=============================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     .DeE------R    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     .  D======eE------R .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D============eE------R   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==================eE------R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .   D========================eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2205,43 +2205,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     7.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     13.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.3   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     25.0   0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.0   0.1    2.8       <total>
 
 # CHECK:      [23] Code Region - G24
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      2403
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.57
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.83
+# CHECK-NEXT: IPC:               0.42
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeeER   .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER.    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D=================eER    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D=================eeeeeER.    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .D=====================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . D====================eeeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .  D===========================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeeeeER .    .    .    ..   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D====eeeeeeeeER   .    ..   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     .  D====eE------R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==========eeeeeeeeER..   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D==========eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D==========eeeeeE--R..   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .D==========eE-----R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . D=========eeeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .  D=========eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2251,43 +2251,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     5.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.2   0.1    0.0       <total>
+# CHECK-NEXT: 5.     1     11.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     11.0   0.0    2.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     11.0   0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.6    0.1    2.9       <total>
 
 # CHECK:      [24] Code Region - G25
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.55
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    2.18
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 7.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2297,43 +2297,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
 
 # CHECK:      [25] Code Region - G26
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total Cycles:      909
 # CHECK-NEXT: Total uOps:        2100
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.57
-# CHECK-NEXT: IPC:               0.27
+# CHECK-NEXT: uOps Per Cycle:    2.31
+# CHECK-NEXT: IPC:               1.10
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeER    .    .    .    .    .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     . D===========eER   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D==========eeeeeeeeER .    .    .    .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .   D=================eER.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D================eeeeeeeeER   .    .   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .D=======================eER  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . D======================eeeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .  D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    . .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     .DeE------R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeE-R    . .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=eE----R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeeeeeeeER . .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .   DeE------R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    DeeeeeeeeER .   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . DeeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .  DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2343,43 +2343,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    1.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.1    0.4    2.9       <total>
 
 # CHECK:      [26] Code Region - G27
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.62
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    2.48
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 8.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeER   .    .    .  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D===================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==================eeeeeeeeER.    .  .   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeER.  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER .   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .   DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2389,43 +2389,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.0       <total>
 
 # CHECK:      [27] Code Region - G28
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4704
+# CHECK-NEXT: Total Cycles:      1210
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.77
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    2.98
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    .    .    .    .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeeER    .    .    .    .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .D======================eER   .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D=====================eeeeeeeeeER.    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .  D=============================eER    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D============================eeeeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeE-------R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .    ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .  DeE-------R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeeER   ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .DeE-------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . DeeeeeeeeeER ..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .  DeE-------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   DeeeeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .DeE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2435,43 +2435,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.7    3.5       <total>
 
 # CHECK:      [28] Code Region - G29
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total Cycles:      1410
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.78
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    2.55
+# CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeeER  .    .    .    .    .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     . D========eER .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D=======eeeeeeeeER    .    .    .    .    .   .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .   D==============eER   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D=============eeeeeeeeeER.    .    .    .   .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .D=====================eER    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D====================eeeeeeeeeER .    .   .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .  D============================eER.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D===========================eeeeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .D===================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeeeER  .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeE-------R  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeeeeeeeER .    .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .   DeE------R .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D==eeeeeeeeeER .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D==eE-------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D==eeeeeeeeeER  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D==eE-------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D==eeeeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D=eE--------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2481,43 +2481,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    1.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     2.0    0.0    8.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.1    0.8    3.5       <total>
 
 # CHECK:      [29] Code Region - G30
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4704
+# CHECK-NEXT: Total Cycles:      2511
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.77
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    1.43
+# CHECK-NEXT: IPC:               0.40
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeeER   .    .    .    .    .    .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .   D===============eER  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D==============eeeeeeeeeeER   .    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    . D======================eER  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D=====================eeeeeeeeeER    .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .    .   D=============================eER   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D============================eeeeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .    .    .D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeeER.    .    .    .    .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   DeE-------R.    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeeeER  .    .    .    .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . DeE-------R  .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=======eeeeeeeeeER   .    .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .    .   D=======eE-------R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D==============eeeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .    .    .D==============eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2527,22 +2527,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     8.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     15.0   0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    0.5    3.5       <total>
 
 # CHECK:      [30] Code Region - G31
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4504
+# CHECK-NEXT: Total Cycles:      4503
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    3
@@ -2552,18 +2552,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .  D===============eER   .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeER.    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     .    D======================eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D=====================eeeeeeeeeER .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .    . D=============================eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D============================eeeeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeER    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .  D=======eE-------R    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==============eeeeeeeeeER.    .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     .    D==============eE-------R.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=====================eeeeeeeeeER .    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .    . D=====================eE-------R .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D============================eeeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D============================eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2573,22 +2573,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     15.0   0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     22.0   0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     29.0   0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.0   0.1    3.5       <total>
 
 # CHECK:      [31] Code Region - G32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3704
+# CHECK-NEXT: Total Cycles:      3703
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
@@ -2598,18 +2598,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeER  .    .    .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D====================eER .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===================eeeeeeER .    .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .    . D========================eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=======================eeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .   .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .   .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .  D=======eE------R.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeER  .    .   .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D=============eE------R  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D===================eeeeeeER .   .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .    . D===================eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D=======================eeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D=======================eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2619,43 +2619,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     8.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     14.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 7.     1     25.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     20.0   0.0    4.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.6   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     24.0   0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     13.4   0.1    2.7       <total>
 
 # CHECK:      [32] Code Region - G33
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1110
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    2.43
+# CHECK-NEXT: IPC:               0.90
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeeeER  .    .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .   D===========eER .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D==========eeeeeeeeER    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     .    .D=================eER   .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D================eeeeeeeeER .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .    .  D=======================eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D======================eeeeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .    .    D==============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeE----R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .    .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .   DeE------R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeER    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     .    .DeE------R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . DeeeeeeeeER  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .    .  DeE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   DeeeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .    .    DeE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2665,43 +2665,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.6    2.9       <total>
 
 # CHECK:      [33] Code Region - G34
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total Cycles:      1109
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.68
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    2.52
+# CHECK-NEXT: IPC:               0.90
 # CHECK-NEXT: Block RThroughput: 9.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .   .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeeER.    .    .    .    .   .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeeER .    .    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D====================eeeeeeER.    .   .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=========================eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D========================eeeeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    D===============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     .DeE------R    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .   .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .  DeE-------R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeER   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     .    DeE-------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeE-R   .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE-----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2711,43 +2711,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    1.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.6    3.2       <total>
 
 # CHECK:      [34] Code Region - G35
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total Cycles:      1010
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.64
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    2.67
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeER .    .    .    .    .    .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D=============eER.    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D============eeeeeeeeeER  .    .    .    .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D====================eER .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===================eeeeeeeeER    .    .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D==========================eER   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D=========================eeeeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .   .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE------R  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeeeeeER   .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE-------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeeeER  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2757,43 +2757,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     20.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     17.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    3.2       <total>
 
 # CHECK:      [35] Code Region - G36
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4604
+# CHECK-NEXT: Total Cycles:      1311
 # CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.74
-# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: uOps Per Cycle:    2.59
+# CHECK-NEXT: IPC:               0.76
 # CHECK-NEXT: Block RThroughput: 13.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .   .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeER.    .    .    .    .    .   .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .  D==============eER    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=============eeeeeeeeeER .    .    .    .   .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    D=====================eER.    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D====================eeeeeeeeeER  .    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D============================eER .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D===========================eeeeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .D===================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeER  .    .  .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .  DeE------R  .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeeeeER  .  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    D==eE-------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeeeeeeER.  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D==eE-------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D==eeeeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .DeE---------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2803,43 +2803,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     14.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 5.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     21.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     1.0    0.0    9.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.0    0.7    3.6       <total>
 
 # CHECK:      [36] Code Region - G37
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4804
+# CHECK-NEXT: Total Cycles:      1610
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.79
-# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: uOps Per Cycle:    2.36
+# CHECK-NEXT: IPC:               0.62
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeeeER  .    .    .    .    .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .    D===============eER .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D==============eeeeeeeeeeeER .    .    .    ..   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .   D======================eER.    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    D=====================eeeeeeeeER   .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .D============================eER  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    . D===========================eeeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .  D===================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeeeER    .    .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .    DeE-------R    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeeeeeeeeeeER.    .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .   DeE-------R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    DeeeeeeeeER    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .DeE------R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D==eeeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .  D==eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2849,43 +2849,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     36.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    3.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.4    0.9    3.4       <total>
 
 # CHECK:      [37] Code Region - G38
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      5104
+# CHECK-NEXT: Total Cycles:      1610
 # CHECK-NEXT: Total uOps:        4200
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.82
-# CHECK-NEXT: IPC:               0.20
+# CHECK-NEXT: uOps Per Cycle:    2.61
+# CHECK-NEXT: IPC:               0.62
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789          01234
-# CHECK-NEXT: Index     0123456789          0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D========eER  .    .    .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=======eeeeeeeeeeeER  .    .    .    .    .    .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    D===============eER .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D==============eeeeeeeeeER   .    .    .    .   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    . D======================eER  .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D=====================eeeeeeeeeeeER  .    .   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .D=============================eER .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    . D============================eeeeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-------R   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeeeER    .    .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeE-------R    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeeeeeeeeER  .    .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . DeE-------R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  DeeeeeeeeeeeER   .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .DeE-------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . DeeeeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    DeE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2895,22 +2895,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    1.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.8    3.5       <total>
 
 # CHECK:      [38] Code Region - G39
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4504
+# CHECK-NEXT: Total Cycles:      4503
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    3
@@ -2920,18 +2920,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     . D=======eER  .    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .    D=============eER   .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D============eeeeeeeeeER.    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    .  D===================eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .   D==================eeeeeeeeeER .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .D=========================eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    . D========================eeeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .    .    .   D===============================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     . DeE------R   .    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .    D=====eE-------R    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D============eeeeeeeeeER.    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    .  D===========eE-------R.    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   D==================eeeeeeeeeER .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .D=================eE-------R .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D========================eeeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .    .    .   D=======================eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2941,22 +2941,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    1.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     6.0    0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     12.0   0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     18.0   0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     24.0   0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.6   0.2    3.4       <total>
 
 # CHECK:      [39] Code Region - G40
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total Cycles:      4303
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    3
@@ -2966,18 +2966,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    ..   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     . D=======eER  .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .    ..   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    D=============eER   .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D============eeeeeeeeeER.    .    .    ..   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     .    .  D===================eER    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .   D==================eeeeeeeeER  .    ..   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    .    D=========================eER .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .D========================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    . D===============================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     . DeE------R   .    .    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    D=====eE-------R    .    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D============eeeeeeeeeER.    .    .    .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     .    .  D===========eE-------R.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   D==================eeeeeeeeER  .    .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    .    D==================eE------R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .D========================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    . D========================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2987,43 +2987,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    1.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     6.0    0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     12.0   0.0    7.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     19.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     25.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.8   0.2    3.2       <total>
 
 # CHECK:      [40] Code Region - G41
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4104
+# CHECK-NEXT: Total Cycles:      2303
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.76
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    1.35
+# CHECK-NEXT: IPC:               0.43
 # CHECK-NEXT: Block RThroughput: 10.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .    .    .    .   .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     . D=======eER  .    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .    .    .    .   .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .    D=============eER   .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D============eeeeeeER   .    .    .   .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     .    . D=================eER  .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D================eeeeeeeeeER    .   .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    D=======================eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .D======================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .    . D=============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345
+
+# CHECK:      [0,0]     DeeeeeeeeeER   .    .    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     . DeE------R   .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D======eeeeeeeeeER    .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .    D=====eE-------R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D=====eeeeeeE-R    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     .    . D=====eE-----R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=====eeeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    D====eE-------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .D====eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .    . D====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3033,43 +3033,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     1.0    1.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+# CHECK-NEXT: 3.     1     6.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    1.0    1.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     6.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    1.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     5.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    1.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.8    0.5    3.2       <total>
 
 # CHECK:      [41] Code Region - G42
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total Cycles:      1309
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.72
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    2.37
+# CHECK-NEXT: IPC:               0.76
 # CHECK-NEXT: Block RThroughput: 10.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeeER.    .    .    .    .    ..   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .   D=============eER    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D============eeeeeeeeER  .    .    .    ..   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     .    .D===================eER .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D==================eeeeeeeeeER   .    ..   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .    .   D=========================eER  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D========================eeeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .    .    . D===============================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    ..   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     .DeE------R    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .    ..   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .   DeE------R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeER    ..   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     .    .DeE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . DeeeeeeeeeER ..   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .    .   DeE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    DeeeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .    .    . DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3079,43 +3079,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.8    3.0       <total>
 
 # CHECK:      [42] Code Region - G43
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1209
 # CHECK-NEXT: Total uOps:        2900
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.72
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    2.40
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 9.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeeeeeER  .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .   D===========eER .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D==========eeeeeeeeER    .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .D=================eER   .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D================eeeeeeeeER .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .  D=======================eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D======================eeeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE----R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER .    .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .   DeE------R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeER    .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .DeE------R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . DeeeeeeeeER  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  DeE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   DeeeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .DeE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3125,43 +3125,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.7    2.8       <total>
 
 # CHECK:      [43] Code Region - G44
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total Cycles:      1206
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.75
-# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: uOps Per Cycle:    2.24
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=======eER   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D======eeeeeeeeeER.    .    .    .   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .   D=============eER    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D============eeeeeeeeeER .    .   .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    . D===================eER.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D==================eeeeeER .   .   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .    .   D======================eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D=====================eeeeeER.   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .    .    .D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    . .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE------R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeeeER . .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   DeE------R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    DeeeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . DeE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  DeeeeeE-R.   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .    .   DeE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    DeeeeeER   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .    .    .DeE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3171,43 +3171,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ldp	s1, s2, [x27], #248
-# CHECK-NEXT: 7.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     22.0   0.0    0.0       ldp	d1, d2, [x27], #496
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    1.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    1.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.7    2.6       <total>
 
 # CHECK:      [44] Code Region - G45
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2306
+# CHECK-NEXT: Total Cycles:      1005
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.95
-# CHECK-NEXT: IPC:               0.43
+# CHECK-NEXT: uOps Per Cycle:    2.19
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 7.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .  .   ldp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     .D=====eER.    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D====eeeeeER .    .    .  .   ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .  D========eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D=======eeeeeER .    .  .   ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .    D===========eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==========eeeeeeER.  .   ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7]     .    . D===============eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D==============eeeeER   ldp	w1, w2, [x27], #248
-# CHECK-NEXT: [0,9]     .    .   D==============eE--R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     .DeE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeER.   .   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  DeE---R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeeER  .   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    DeE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeeER   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     .    . DeE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeER   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .    .   DeE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3217,43 +3217,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     11.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       ldp	w1, w2, [x27], #248
-# CHECK-NEXT: 9.     1     15.0   0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     9.8    0.1    0.2       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    1.6       <total>
 
 # CHECK:      [45] Code Region - G46
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1307
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.84
-# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: uOps Per Cycle:    2.39
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .   .   ldp	x1, x2, [x27], #496
-# CHECK-NEXT: [0,1]     .DeE--R   .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeER .    .   .   ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .  DeE--R .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   DeeeeER    .   .   ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .    DeE--R    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .DeeeeeER .   .   ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: [0,7]     .    . D====eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D===eeeeeER.   ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: [0,9]     .    .   D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     .DeE--R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeER .    .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .  DeE--R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeER    .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .    DeE--R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeeeER .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .    . DeE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeeeER   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     .    .   DeE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3269,10 +3269,10 @@ add x0, x27, 1
 # CHECK-NEXT: 4.     1     1.0    1.0    0.0       ldp	x1, x2, [x27, #496]!
 # CHECK-NEXT: 5.     1     1.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     1.0    1.0    0.0       ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.4    0.4    0.6       <total>
+# CHECK-NEXT: 7.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     1.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    1.2       <total>
 
 # CHECK:      [46] Code Region - G47
 
@@ -3324,28 +3324,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.96
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .  .   ldr	b1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .  .   ldr	h1, [x27, #254]!
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeER  .    .  .   ldr	s1, [x27, #254]!
-# CHECK-NEXT: [0,5]     . D=============eER .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D============eeeeeER  .  .   ldr	d1, [x27, #254]!
-# CHECK-NEXT: [0,7]     .  D=================eER .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D================eeeeeER.   ldr	q1, [x27, #254]!
-# CHECK-NEXT: [0,9]     .   D=====================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  ..   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE---R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER ..   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     .D=eE---R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeER..   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . D=eE---R..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER.   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .  D=eE---R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeeeeER   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     .   D=eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3355,16 +3355,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       ldr	h1, [x27, #254]!
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       ldr	s1, [x27, #254]!
-# CHECK-NEXT: 5.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     13.0   0.0    0.0       ldr	d1, [x27, #254]!
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ldr	q1, [x27, #254]!
-# CHECK-NEXT: 9.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    0.1    1.5       <total>
 
 # CHECK:      [48] Code Region - G49
 
@@ -3508,7 +3508,7 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      604
+# CHECK-NEXT: Total Cycles:      603
 # CHECK-NEXT: Total uOps:        1600
 
 # CHECK:      Dispatch Width:    3
@@ -3517,18 +3517,18 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 5.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeeeER  .   ldrsh	x1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D=eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeeeeER .   ldrsw	x1, [x27], #254
-# CHECK-NEXT: [0,3]     .D=eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeeER.   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,5]     . D=eE--R.   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  DeE--R.   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7]     .  D=eE-R.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   DeeER.   st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    D=eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER.   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     .D=eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeER   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     . D=eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeE--R   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D=eE-R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   DeeER   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    DeER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3546,35 +3546,34 @@ add x0, x27, 1
 # CHECK-NEXT: 6.     1     1.0    0.0    2.0       st1	{ v1.1d }, [x27], #8
 # CHECK-NEXT: 7.     1     2.0    0.0    1.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.5    0.1    0.9       <total>
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.4    0.1    0.9       <total>
 
 # CHECK:      [52] Code Region - G53
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: uOps Per Cycle:    2.42
 # CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .   st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.    .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     .D=eER    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeER   .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     .  D=eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=eER  .   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .   D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    DeeER.   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .D=eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeER .   .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.   .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .  DeER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eER .   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .   D=eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    DeeER   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .DeER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3588,12 +3587,12 @@ add x0, x27, 1
 # CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
 # CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st1	{ v1.8b }, [x27], #8
 # CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.6    0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.4    0.2    0.0       <total>
 
 # CHECK:      [53] Code Region - G54
 
@@ -3612,11 +3611,11 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.    .   st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1]     .D=eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     .DeER.    .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     .D=eER    .   st1	{ v1.1d }, [x27], x28
 # CHECK-NEXT: [0,3]     . D=eER   .   add	x0, x27, #1
 # CHECK-NEXT: [0,4]     .  DeeER  .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .   D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,5]     .   DeER  .   add	x0, x27, #1
 # CHECK-NEXT: [0,6]     .   D=eER .   st1	{ v1.2s }, [x27], x28
 # CHECK-NEXT: [0,7]     .    D=eER.   add	x0, x27, #1
 # CHECK-NEXT: [0,8]     .    D=eER.   st1	{ v1.4h }, [x27], x28
@@ -3630,22 +3629,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st1	{ v1.1d }, [x27], x28
 # CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st1	{ v1.2s }, [x27], x28
 # CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
 # CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.8    0.1    0.0       <total>
+# CHECK-NEXT:        1     1.6    0.3    0.0       <total>
 
 # CHECK:      [54] Code Region - G55
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      904
+# CHECK-NEXT: Total Cycles:      903
 # CHECK-NEXT: Total uOps:        1900
 
 # CHECK:      Dispatch Width:    3
@@ -3654,19 +3653,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 9.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=eER    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eER    . .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     . D=eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  DeeER  . .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .   D=eER . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    DeeER. .   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .D=eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    . DeeER.   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .  D=eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.    ..   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=eER    ..   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     . D=eER   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeER  ..   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .   DeER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    DeeER..   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .DeER..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    . DeeER   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .  DeER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3676,22 +3675,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       st1	{ v1.8b }, [x27], x28
 # CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 9.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     1.6    0.1    0.0       <total>
+# CHECK-NEXT: 5.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.2    0.4    0.0       <total>
 
 # CHECK:      [55] Code Region - G56
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1404
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    3
@@ -3700,19 +3699,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    . .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     .D===eER  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D==eeER .    . .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     .  D===eER.    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeER    . .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     .    D===eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==eeeeER. .   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    . D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D====eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     .    .   D=====eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    ..   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE--R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeER .    ..   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     .  D==eER .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeER    ..   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .    D==eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeER..   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    . D==eE--R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeER   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     .    .   D====eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3722,22 +3721,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     3.9    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    1.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    3.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.9    0.4       <total>
 
 # CHECK:      [56] Code Region - G57
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    3
@@ -3746,19 +3745,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .   .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     .D===eER  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D==eeeeER    .   .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .  D=====eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D====eeER  .   .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D=====eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D====eeeeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D======eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     .DeE--R   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeeeER    .  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .  D==eE--R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D====eeER  .  .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====eeeeER .   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D====eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D======eeER   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3768,22 +3767,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.3    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    3.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    3.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.2    1.1    0.6       <total>
 
 # CHECK:      [57] Code Region - G58
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    3
@@ -3792,19 +3791,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     012345678
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .    .   .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D=eER    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . DeeeeER .    .   .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===eER.    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeER    .   .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    D===eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D==eeeeER.   .   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=====eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D====eeeeER.   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.    .    .  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeER.    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeER .    .  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE--R .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeER    .  .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    D==eER    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D==eeeeER.  .   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    . D==eE--R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  D====eeeeER   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .   D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3814,22 +3813,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     3.7    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    3.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.9    0.6       <total>
 
 # CHECK:      [58] Code Region - G59
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total Cycles:      2103
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    3
@@ -3839,18 +3838,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .   .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     .D==eER   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeeeeER   .    .   .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .   D=====eER  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D====eeeER.    .   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .D======eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D=====eeeER  .   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .    .  D=======eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D======eeeeeeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .    .    .D==========eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeER    .    .    .  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeE-R    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeeeeER   .    .  .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .   DeE----R   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D====eeeER.    .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .D====eE-R.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=====eeeER  .  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .    .  D=====eE-R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D======eeeeeeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .    .    .D=====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3860,22 +3859,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.6    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    5.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     5.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    2.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     6.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    2.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     6.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.0    1.2    1.1       <total>
 
 # CHECK:      [59] Code Region - G60
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2404
+# CHECK-NEXT: Total Cycles:      2403
 # CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    3
@@ -3885,18 +3884,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
-
-# CHECK:      [0,0]     DeeeER    .    .    .    . .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     .D==eER   .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeeeeER   .    .    . .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .   D=====eER  .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D====eeeeeeER  .    . .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     .    . D========eER .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D=======eeeER    . .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .   D=========eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D========eeeeeeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    . D============eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456
+
+# CHECK:      [0,0]     DeeeER    .    .    .    ..   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     .DeE-R    .    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeeeeER   .    .    ..   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .   DeE----R   .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D====eeeeeeER  .    ..   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     .    . D===eE----R  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D=======eeeER    ..   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .   D=======eE-R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D========eeeeeeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    . D=======eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3906,22 +3905,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     9.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     6.6    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    5.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     8.0    5.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     8.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    2.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     8.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.7    1.5    1.4       <total>
 
 # CHECK:      [60] Code Region - G61
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total Cycles:      2103
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    3
@@ -3931,18 +3930,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     .D==eER   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    .    .   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeeeeER.    .   .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .D======eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D=====eeeER  .   .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .  D=======eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D======eeeeeeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .D==========eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeER    .    .    .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-R    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .    .  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .  D=eE-R .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeER.    .  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D=eE----R.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=====eeeER  .  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D=====eE-R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D======eeeeeeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D=====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3952,22 +3951,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    5.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     6.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    2.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.6    1.2    1.1       <total>
 
 # CHECK:      [61] Code Region - G62
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2604
+# CHECK-NEXT: Total Cycles:      2603
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    3
@@ -3977,18 +3976,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          012345678
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .   .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     . D====eER.    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D===eeeeER  .    .    .   .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .   D======eER .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D=====eeeeeeeeER    .   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .  D==========eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .   D=========eeeeER.   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .    .    D============eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .D===========eeeeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    . D==============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeE---R .    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D===eeeeER  .    .    .  .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .   D===eE--R  .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D=====eeeeeeeeER    .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .  D===eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .   D=========eeeeER.  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .    .    D=========eE--R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .D===========eeeeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    . D===========eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3998,22 +3997,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 5.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     10.0   0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 7.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     12.0   0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 9.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     8.4    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    4.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    3.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     10.0   7.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     10.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     12.0   3.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     12.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.4    1.9    1.5       <total>
 
 # CHECK:      [62] Code Region - G63
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total Cycles:      3203
 # CHECK-NEXT: Total uOps:        4200
 
 # CHECK:      Dispatch Width:    3
@@ -4022,19 +4021,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 32.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT:                     0123456789          01234
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     .  D=====eER   .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    .    .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .    D=======eER    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D======eeeeeeeeER  .    .    .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     .    .   D===========eER .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    D==========eeeeeeeeER    .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .  D===============eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .   D==============eeeeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    D=================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .   .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     .  DeE----R    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    .   .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .    D====eE--R.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D======eeeeeeeeER  .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     .    .   D====eE------R  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D==========eeeeeeeeER   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .  D========eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .   D==============eeeeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    D==============eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4044,22 +4043,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     11.0   0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     9.9    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    5.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    3.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     5.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     11.0   7.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     15.0   7.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     15.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.4    2.4    2.0       <total>
 
 # CHECK:      [63] Code Region - G64
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total Cycles:      2803
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    3
@@ -4068,19 +4067,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 28.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    ..   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .  D=====eER   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    ..   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    D=======eER    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D======eeeeER .    .    ..   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    . D=========eER.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D========eeeeeeeeER   ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .D=============eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    . D============eeeeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .  D===============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeE----R    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    D====eE--R.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D======eeeeER .    .    .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . D======eE--R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D========eeeeeeeeER   .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .D======eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    . D============eeeeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .  D============eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4090,16 +4089,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     9.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     13.0   0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     8.9    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    5.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    3.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     7.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    3.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     7.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     13.0   7.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     13.0   0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.8    2.0    1.6       <total>
 
 # CHECK:      [64] Code Region - G65
 
@@ -4118,9 +4117,9 @@ add x0, x27, 1
 # CHECK-NEXT: Index     0123456789          012
 
 # CHECK:      [0,0]     DeeeeeeeeER    .    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .  D=====eER   .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,1]     .  DeE----R    .    . .   add	x0, x27, #1
 # CHECK-NEXT: [0,2]     .   D====eeeeeeeeER . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    . D=========eER. .   add	x0, x27, #1
+# CHECK-NEXT: [0,3]     .    . D==eE------R . .   add	x0, x27, #1
 # CHECK-NEXT: [0,4]     .    . D=========eER. .   st1	{ v1.b }[0], [x27], #1
 # CHECK-NEXT: [0,5]     .    .  D=========eER .   add	x0, x27, #1
 # CHECK-NEXT: [0,6]     .    .  D=========eER .   st1	{ v1.b }[8], [x27], #1
@@ -4136,16 +4135,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 1.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    5.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     10.0   7.0    0.0       st1	{ v1.b }[0], [x27], #1
 # CHECK-NEXT: 5.     1     10.0   0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     10.0   0.0    0.0       st1	{ v1.b }[8], [x27], #1
 # CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
 # CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     8.2    0.1    0.0       <total>
+# CHECK-NEXT:        1     7.0    1.4    1.0       <total>
 
 # CHECK:      [65] Code Region - G66
 
@@ -4196,28 +4195,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1204
+# CHECK-NEXT: Total Cycles:      805
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.66
-# CHECK-NEXT: IPC:               0.83
+# CHECK-NEXT: uOps Per Cycle:    2.48
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .    .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D=eER.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .DeER.    .    .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D=eER    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . DeeeER  .    .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     .  D==eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D=eeeER    .   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    D===eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D==eeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     .    . D=====eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeER .    . .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeER.    . .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeER  . .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .  DeE-R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   DeeeER. .   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    DeE-R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .DeeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     .    . DeE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4231,39 +4230,39 @@ add x0, x27, 1
 # CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
 # CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.5    0.1    0.0       <total>
+# CHECK-NEXT: 5.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.2    0.3    0.4       <total>
 
 # CHECK:      [67] Code Region - G68
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        3100
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.82
-# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: uOps Per Cycle:    2.21
+# CHECK-NEXT: IPC:               0.71
 # CHECK-NEXT: Block RThroughput: 14.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     .D==eER   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    .    .   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     .  D===eER.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeeER  .    .   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .D====eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D===eeeER    .   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     .    .  D=====eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D====eeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    .D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    .    ..   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     .DeE-R    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeER  .    ..   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     .  DeE-R  .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeeER    ..   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .DeE-R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D=eeeER ..   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .    .  D=eE-R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D=eeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    .DeE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4273,43 +4272,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     1.0    1.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    2.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.3    0.7    0.6       <total>
 
 # CHECK:      [68] Code Region - G69
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total Cycles:      1603
 # CHECK-NEXT: Total uOps:        3200
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.77
-# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: uOps Per Cycle:    2.00
+# CHECK-NEXT: IPC:               0.62
 # CHECK-NEXT: Block RThroughput: 16.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    ..   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     . D==eER  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D=eeeeER    .    ..   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .   D====eER   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D===eeeER .    ..   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .D=====eER.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    . D====eeeER   ..   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .  D======eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .   D=====eeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .  .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeE-R   .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D=eeeeER    .  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .   D=eE--R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D===eeeER .  .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .D===eE-R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    . D===eeeER  .   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .  D===eE-R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .   D===eeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .D==eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4319,43 +4318,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     6.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.7    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    3.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    1.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    1.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.9    0.7       <total>
 
 # CHECK:      [69] Code Region - G70
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1704
+# CHECK-NEXT: Total Cycles:      1205
 # CHECK-NEXT: Total uOps:        2900
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.70
-# CHECK-NEXT: IPC:               0.59
+# CHECK-NEXT: uOps Per Cycle:    2.41
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .    .   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     .D==eER   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeeER.    .    .   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .   D===eER    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    D==eeeeER .    .   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .    . D====eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D===eeeER   .   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .    .   D=====eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D====eeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     .    .    .D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    .    ..   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-R    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeER .    ..   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .   DeE-R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    D=eeeeER  ..   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .    . DeE--R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  D==eeeER.   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .    .   D==eE-R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D=eeeER   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     .    .    .D=eE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4365,43 +4364,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    2.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    3.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.8    0.6       <total>
 
 # CHECK:      [70] Code Region - G71
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total Cycles:      1004
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.33
-# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .D==eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .  D===eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeER   .  .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===eeeER.  .   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .    . D=====eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D====eeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeER  .  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE-R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeER.  .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    DeE-R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeER .   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .    . DeE-R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4411,43 +4410,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.5       <total>
 
 # CHECK:      [71] Code Region - G72
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total Cycles:      1003
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.53
-# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    ..   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     .D==eER   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    ..   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .  D===eER.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeER   ..   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D====eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===eeER ..   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .    . D====eER..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D===eeER.   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D====eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    . .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-R    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeER  . .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .  DeE-R  . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeER. .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE-R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeER .   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .    . DeER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeER   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4457,22 +4456,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     3.6    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.3       <total>
 
 # CHECK:      [72] Code Region - G73
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      1204
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    3
@@ -4481,15 +4480,15 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
+# CHECK-NEXT:                     01234
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,1]     . D====eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D===eeeER   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3]     .    D====eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D===eeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,5]     .    .  D====eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1]     . DeE---R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D===eeeER  .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3]     .    D==eE-R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D===eeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,5]     .    .  D==eE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4499,18 +4498,18 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.2    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    4.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     4.0    2.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    1.3    0.8       <total>
 
 # CHECK:      [73] Code Region - G74
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total Cycles:      2703
 # CHECK-NEXT: Total uOps:        5100
 
 # CHECK:      Dispatch Width:    3
@@ -4519,19 +4518,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 27.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT:                     0123456789
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     .  D===eER.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   D==eeeER   .    .    .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .    .D===eER  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    . D==eeeeeeER  .    .    .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     .    .    D=====eER .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .D====eeeeeeER .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .    .    .   D=======eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    D======eeeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    . D==========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .    .    .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     .  DeE--R .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D==eeeER   .    .    .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .    .D=eE-R   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . D==eeeeeeER  .    .   .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     .    .    DeE----R  .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .D====eeeeeeER .   .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .    .    .   D==eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    D======eeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . D=====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4541,22 +4540,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    5.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    5.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.2    1.7    1.5       <total>
 
 # CHECK:      [74] Code Region - G75
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total Cycles:      2103
 # CHECK-NEXT: Total uOps:        4500
 
 # CHECK:      Dispatch Width:    3
@@ -4566,18 +4565,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .   .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     . D=eER   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  DeeeER .    .    .   .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .    D=eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .DeeeeeeER.    .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .   D===eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    D==eeeER  .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    . D===eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .  D==eeeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    .D=====eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeER    .    .    .  .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     . DeER    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  DeeeER .    .    .  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .    DeER .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .DeeeeeeER.    .  .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   DeE--R.    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D==eeeER  .  .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    . D=eE-R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .  D==eeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    .DeE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4587,43 +4586,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     1.0    1.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    3.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    2.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.5    1.1    0.7       <total>
 
 # CHECK:      [75] Code Region - G76
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total Cycles:      1204
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.55
-# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: uOps Per Cycle:    2.33
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    ..   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     .  D===eER.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   D==eeeER   .    ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .    D====eER  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .D===eeeER.    ..   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     .    . D=====eER    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .  D====eeeER  ..   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    .   D======eER ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D=====eeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeE--R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D==eeeER   .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .    D==eE-R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .D=eeeER  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     .    . D=eE-R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .  DeeeER .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    .   DeE-R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    DeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .DeE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4633,43 +4632,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.9    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     3.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.6    0.6    0.6       <total>
 
 # CHECK:      [76] Code Region - G77
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total Cycles:      1004
 # CHECK-NEXT: Total uOps:        2100
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.40
-# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: uOps Per Cycle:    2.09
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     .D==eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .  D===eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeER   .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===eeeER.  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=====eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D====eeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     .DeE-R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeER  .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .  DeE-R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeER.  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    DeE-R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeER .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE-R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .    .   DeE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4679,43 +4678,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.5       <total>
 
 # CHECK:      [77] Code Region - G78
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        3300
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.57
-# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: uOps Per Cycle:    1.73
+# CHECK-NEXT: IPC:               0.53
 # CHECK-NEXT: Block RThroughput: 19.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .   .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     .D==eER   .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    .    .   .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .  D===eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeER   .    .   .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .    D====eER  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===eeeeeeeeER.   .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .   D========eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    D=======eeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .    .    . D=========eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          01
+
+# CHECK:      [0,0]     DeeeER    .    .    ..   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-R    .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D=eeeER .    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .  D=eE-R .    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D=eeeER    .    ..   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .    D=eE-R    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D=eeeeeeeeER  ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .   DeE-----R  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    D=====eeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .    .    . D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4725,22 +4724,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 7.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     8.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 9.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.9    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     1.0    1.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     6.0    6.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.4    1.2    1.0       <total>
 
 # CHECK:      [78] Code Region - G79
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total Cycles:      3203
 # CHECK-NEXT: Total uOps:        5800
 
 # CHECK:      Dispatch Width:    3
@@ -4749,19 +4748,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 32.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
+# CHECK-NEXT:                     0123456789          01234
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .    .    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     . D==eER  .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .  D=eeeeeeeeER.    .    .    .    .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     .    . D=====eER    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .  D====eeeeER .    .    .    .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .    .    D======eER.    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    .D=====eeeeeeeeER   .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    .    .    D=========eER  .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    .D========eeeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    .    .    .    D============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     . DeE-R   .    .    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .  D=eeeeeeeeER.    .    .    .   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     .    . DeE----R.    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .  D====eeeeER .    .    .   .   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .    .    D===eE--R .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    .D=====eeeeeeeeER   .   .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    .    .    D==eE------R   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    .D========eeeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    .    .    .    D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4771,22 +4770,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     9.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 9.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     6.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    5.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    3.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    7.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.8    2.0    1.9       <total>
 
 # CHECK:      [79] Code Region - G80
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2804
+# CHECK-NEXT: Total Cycles:      2803
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    3
@@ -4795,19 +4794,19 @@ add x0, x27, 1
 # CHECK-NEXT: Block RThroughput: 28.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          01
+# CHECK-NEXT:                     0123456789          0
 # CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    ..   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     .  D=====eER   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    ..   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .D======eER    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    . D=====eeeeER .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .    .   D=======eER.    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    D======eeeeeeeeER   ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .    .    .   D==========eER  ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .    D=========eeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    . D===========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     .  DeE----R    .    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .   D====eeeeER.    .    .    .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .D===eE--R.    .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    . D=====eeeeER .    .    .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .    .   D====eE--R .    .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    D======eeeeeeeeER   .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .    .    .   D===eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .    D=========eeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    . D========eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4817,43 +4816,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     6.0    0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.3    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     5.0    5.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     6.0    3.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     7.0    3.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   7.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     9.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     5.2    2.0    1.6       <total>
 
 # CHECK:      [80] Code Region - G81
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2504
+# CHECK-NEXT: Total Cycles:      1905
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.60
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    2.10
+# CHECK-NEXT: IPC:               0.52
 # CHECK-NEXT: Block RThroughput: 19.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .   D====eER   .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .    D===eeeeeeeeER .    .  .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .    .   D=======eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .    .    D======eeeER   .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .    .    .D========eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .    . D=======eeeER.  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    .    .  D=========eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .    .   D========eeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .    .    D==========eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .  .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .   DeE---R    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .    D===eeeeeeeeER .  .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .    .   DeE------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .    .    D======eeeER .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .    .    .D======eE-R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .    . D=====eeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    .    .  D=====eE-R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .    .   D====eeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .    .    D====eE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4863,43 +4862,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     7.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 5.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     8.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 7.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     9.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     7.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    1.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     4.0    4.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     7.0    7.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     7.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     6.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     4.3    1.3    1.2       <total>
 
 # CHECK:      [81] Code Region - G82
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1504
+# CHECK-NEXT: Total Cycles:      1004
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.33
-# CHECK-NEXT: IPC:               0.66
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               1.00
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeER    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     .D==eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=eeeER .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .  D===eER.    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==eeeER   .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     .    D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D===eeeER.  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=====eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .  D====eeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .    .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeER    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeER  .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .  DeE-R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   DeeeER.  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     .    DeE-R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .DeeeER .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . DeE-R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .  DeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .    .   DeE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4909,41 +4908,41 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     1.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     1.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.0    0.5    0.5       <total>
 
 # CHECK:      [82] Code Region - G83
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      800
-# CHECK-NEXT: Total Cycles:      1404
+# CHECK-NEXT: Total Cycles:      1204
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    1.57
-# CHECK-NEXT: IPC:               0.57
+# CHECK-NEXT: uOps Per Cycle:    1.83
+# CHECK-NEXT: IPC:               0.66
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01234567
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    . .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     .D===eER  .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D==eeeeER    . .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .  D=====eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D====eeeER . .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     .    D======eER. .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    .D=====eeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .    . D=======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     .DeE--R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . D==eeeeER    .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .  D==eE--R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D====eeeER .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     .    D====eE-R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    .D====eeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .    . D====eE-R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4953,14 +4952,14 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     5.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     3.0    3.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     5.0    3.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     5.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     5.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     5.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    1.0    0.8       <total>
 
 # CHECK:      [83] Code Region - G84
 


        


More information about the llvm-commits mailing list