[llvm] c1b6ed4 - [AArch64] Fix postinc operands for Neoverse-N2 scheduling
David Green via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 11 09:57:49 PDT 2023
Author: David Green
Date: 2023-10-11T17:57:42+01:00
New Revision: c1b6ed42b75a17d1718aa377f76633b27e15a4e2
URL: https://github.com/llvm/llvm-project/commit/c1b6ed42b75a17d1718aa377f76633b27e15a4e2
DIFF: https://github.com/llvm/llvm-project/commit/c1b6ed42b75a17d1718aa377f76633b27e15a4e2.diff
LOG: [AArch64] Fix postinc operands for Neoverse-N2 scheduling
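Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc
loads/stores in the Neoverse-N2 scheduling model. The written-back base
register is the first def operand of these instructions, so WriteAdr has to be
listed first in the InstRW write list; with the old order the address update
was modelled with the full load/store latency, which stalled dependent
instructions that only consume the updated base register.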
Added:
Modified:
llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index a6e28e653e33c17..517d0da7f47f428 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -742,7 +742,7 @@ def : SchedAlias<WriteLDIdx, N2Write_4cyc_1I_1L>;
// Load pair, signed immed offset, signed words
def : InstRW<[N2Write_5cyc_1M0, WriteLDHi], (instrs LDPSWi)>;
// Load pair, immed post-index or immed pre-index, signed words
-def : InstRW<[N2Write_5cyc_1M0, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_5cyc_1M0, WriteLDHi],
(instregex "^LDPSW(post|pre)$")>;
// Store instructions
@@ -860,7 +860,7 @@ def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[SDQ]l$",
// Load vector reg, immed post-index
def : InstRW<[N2Write_6cyc_1I_1L, WriteI], (instregex "^LDR[BHSDQ]post$")>;
// Load vector reg, immed pre-index
-def : InstRW<[N2Write_6cyc_1I_1L, WriteAdr], (instregex "^LDR[BHSDQ]pre$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_1I_1L], (instregex "^LDR[BHSDQ]pre$")>;
// Load vector reg, unsigned immed
def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>;
@@ -883,12 +883,12 @@ def : InstRW<[N2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
// Load vector pair, immed post-index, S/D-form
// Load vector pair, immed pre-index, S/D-form
-def : InstRW<[N2Write_6cyc_1I_1L, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_1I_1L, WriteLDHi],
(instregex "^LDP[SD](pre|post)$")>;
// Load vector pair, immed post-index, Q-form
// Load vector pair, immed pre-index, Q-form
-def : InstRW<[N2Write_6cyc_2I_2L, WriteLDHi, WriteAdr], (instrs LDPQpost,
+def : InstRW<[WriteAdr, N2Write_6cyc_2I_2L, WriteLDHi], (instrs LDPQpost,
LDPQpre)>;
// FP store instructions
@@ -1238,223 +1238,223 @@ def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
// ASIMD load, 1 element, multiple, 1 reg, D-form
def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_6cyc_1L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_1L],
(instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 1 reg, Q-form
def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_1L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_1L],
(instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 2 reg, D-form
def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_6cyc_2L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_2L],
(instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 2 reg, Q-form
def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_2L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_2L],
(instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 3 reg, D-form
def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_6cyc_3L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_3L],
(instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 3 reg, Q-form
def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_3L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_3L],
(instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 4 reg, D-form
def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_7cyc_4L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_7cyc_4L],
(instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 4 reg, Q-form
def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_7cyc_4L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_7cyc_4L],
(instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, one lane, B/H/S
// ASIMD load, 1 element, one lane, D
def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
// ASIMD load, 1 element, all lanes, D-form, B/H/S
// ASIMD load, 1 element, all lanes, D-form, D
def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, all lanes, Q-form
def : InstRW<[N2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, multiple, D-form, B/H/S
def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
// ASIMD load, 2 element, multiple, Q-form, B/H/S
// ASIMD load, 2 element, multiple, Q-form, D
def : InstRW<[N2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, one lane, B/H
// ASIMD load, 2 element, one lane, S
// ASIMD load, 2 element, one lane, D
def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
// ASIMD load, 2 element, all lanes, D-form, B/H/S
// ASIMD load, 2 element, all lanes, D-form, D
def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 2 element, all lanes, Q-form
def : InstRW<[N2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 3 element, multiple, D-form, B/H/S
def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
// ASIMD load, 3 element, multiple, Q-form, B/H/S
def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s)$")>;
-def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
// ASIMD load, 3 element, multiple, Q-form, D
def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Threev(2d)$")>;
-def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_3V], (instregex "LD3Threev(2d)_POST$")>;
// ASIMD load, 3 element, one lane, B/H
// ASIMD load, 3 element, one lane, S
// ASIMD load, 3 element, one lane, D
def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
// ASIMD load, 3 element, all lanes, D-form, B/H/S
// ASIMD load, 3 element, all lanes, D-form, D
def : InstRW<[N2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 3 element, all lanes, Q-form, B/H/S
// ASIMD load, 3 element, all lanes, Q-form, D
def : InstRW<[N2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, multiple, D-form, B/H/S
def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
// ASIMD load, 4 element, multiple, Q-form, B/H/S
// ASIMD load, 4 element, multiple, Q-form, D
def : InstRW<[N2Write_9cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_9cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_9cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, one lane, B/H
// ASIMD load, 4 element, one lane, S
// ASIMD load, 4 element, one lane, D
def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
// ASIMD load, 4 element, all lanes, D-form, B/H/S
// ASIMD load, 4 element, all lanes, D-form, D
def : InstRW<[N2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 4 element, all lanes, Q-form, B/H/S
// ASIMD load, 4 element, all lanes, Q-form, D
def : InstRW<[N2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_4L_4V, WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD store instructions
// -----------------------------------------------------------------------------
// ASIMD store, 1 element, multiple, 1 reg, D-form
def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 1 reg, Q-form
def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 2 reg, D-form
def : InstRW<[N2Write_2cyc_1L01_1V], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 2 reg, Q-form
def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_2L01_2V], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 3 reg, D-form
def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_2L01_2V], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 3 reg, Q-form
def : InstRW<[N2Write_2cyc_3L01_3V], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_3L01_3V, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_3L01_3V], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 4 reg, D-form
def : InstRW<[N2Write_2cyc_2L01_2V], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_2L01_2V], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 4 reg, Q-form
def : InstRW<[N2Write_2cyc_4L01_4V], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_4L01_4V, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_4L01_4V], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, one lane, B/H/S
// ASIMD store, 1 element, one lane, D
def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST1i(8|16|32|64)$")>;
-def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_1L01_1V], (instregex "ST1i(8|16|32|64)_POST$")>;
// ASIMD store, 2 element, multiple, D-form, B/H/S
def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2Twov(8b|4h|2s)$")>;
-def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_1L01_1V], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
// ASIMD store, 2 element, multiple, Q-form, B/H/S
// ASIMD store, 2 element, multiple, Q-form, D
def : InstRW<[N2Write_4cyc_2L01_2V], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_4cyc_2L01_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_2L01_2V], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 2 element, one lane, B/H/S
// ASIMD store, 2 element, one lane, D
def : InstRW<[N2Write_4cyc_1L01_1V], (instregex "ST2i(8|16|32|64)$")>;
-def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_1L01_1V], (instregex "ST2i(8|16|32|64)_POST$")>;
// ASIMD store, 3 element, multiple, D-form, B/H/S
def : InstRW<[N2Write_5cyc_2L01_2V], (instregex "ST3Threev(8b|4h|2s)$")>;
-def : InstRW<[N2Write_5cyc_2L01_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_5cyc_2L01_2V], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
// ASIMD store, 3 element, multiple, Q-form, B/H/S
// ASIMD store, 3 element, multiple, Q-form, D
def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 3 element, one lane, B/H
// ASIMD store, 3 element, one lane, S
// ASIMD store, 3 element, one lane, D
def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST3i(8|16|32|64)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST3i(8|16|32|64)_POST$")>;
// ASIMD store, 4 element, multiple, D-form, B/H/S
def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4Fourv(8b|4h|2s)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
// ASIMD store, 4 element, multiple, Q-form, B/H/S
def : InstRW<[N2Write_7cyc_6L01_6V], (instregex "ST4Fourv(16b|8h|4s)$")>;
-def : InstRW<[N2Write_7cyc_6L01_6V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_7cyc_6L01_6V], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
// ASIMD store, 4 element, multiple, Q-form, D
def : InstRW<[N2Write_5cyc_4L01_4V], (instregex "ST4Fourv(2d)$")>;
-def : InstRW<[N2Write_5cyc_4L01_4V, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_5cyc_4L01_4V], (instregex "ST4Fourv(2d)_POST$")>;
// ASIMD store, 4 element, one lane, B/H/S
def : InstRW<[N2Write_6cyc_3L01_3V], (instregex "ST4i(8|16|32)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4i(8|16|32)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST4i(8|16|32)_POST$")>;
// ASIMD store, 4 element, one lane, D
def : InstRW<[N2Write_4cyc_3L01_3V], (instregex "ST4i(64)$")>;
-def : InstRW<[N2Write_4cyc_3L01_3V, WriteAdr], (instregex "ST4i(64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_3L01_3V], (instregex "ST4i(64)_POST$")>;
// Cryptography extensions
// -----------------------------------------------------------------------------
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
index 7ee97a088ecba18..0c6ccc1face972f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -1182,28 +1182,28 @@ add x0, x27, 1
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] D============eeeeeeER . . . ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5] D==================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] .D=======================eeeeeeER. ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9] .D=============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1213,43 +1213,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 5. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 24.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 9. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.6 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
# CHECK: [1] Code Region - G02
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] D============eeeeeeER . . . ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5] D==================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] .D=======================eeeeeeER. ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9] .D=============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1259,43 +1259,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 5. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 24.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.6 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
# CHECK: [2] Code Region - G03
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] D============eeeeeeER . . . ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5] D==================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] .D=======================eeeeeeER. ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9] .D=============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1305,43 +1305,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 24.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.6 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
# CHECK: [3] Code Region - G04
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1900
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.63
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 3.74
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] D============eeeeeeER . . . ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] .D=======================eeeeeeER. ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9] .D=============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1351,43 +1351,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 24.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.5 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.0 0.1 2.0 <total>
# CHECK: [4] Code Region - G05
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.67
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 3.94
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeeeER. ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1397,43 +1397,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
# CHECK: [5] Code Region - G06
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.67
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 3.94
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeeeER. ld1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1443,43 +1443,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
# CHECK: [6] Code Region - G07
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2300
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.77
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 4.53
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 4.3
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeeeER. ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1489,43 +1489,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
# CHECK: [7] Code Region - G08
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.83
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 4.92
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeeeER. ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1535,43 +1535,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
# CHECK: [8] Code Region - G09
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.83
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 4.92
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeeeER. ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1581,43 +1581,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
# CHECK: [9] Code Region - G10
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3204
+# CHECK-NEXT: Total Cycles: 608
# CHECK-NEXT: Total uOps: 2700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.84
-# CHECK-NEXT: IPC: 0.31
+# CHECK-NEXT: uOps Per Cycle: 4.44
+# CHECK-NEXT: IPC: 1.64
# CHECK-NEXT: Block RThroughput: 5.7
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7] . D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=======================eeeeeeeER. ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9] . D==============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1627,43 +1627,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 24.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 9. 1 31.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.5 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.6 0.1 2.2 <total>
# CHECK: [10] Code Region - G11
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3504
+# CHECK-NEXT: Total Cycles: 675
# CHECK-NEXT: Total uOps: 3000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.86
-# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: uOps Per Cycle: 4.44
+# CHECK-NEXT: IPC: 1.48
# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeER. . . . . . . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1] D=======eER . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3] .D=============eER . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5] . D===================eER. . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7] . D=========================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D========================eeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9] . D===============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1673,43 +1673,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 3. 1 14.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 5. 1 20.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 7. 1 26.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 9. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.9 0.2 2.5 <total>
# CHECK: [11] Code Region - G12
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3504
+# CHECK-NEXT: Total Cycles: 675
# CHECK-NEXT: Total uOps: 3000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.86
-# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: uOps Per Cycle: 4.44
+# CHECK-NEXT: IPC: 1.48
# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeER. . . . . . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1] D=======eER . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3] .D=============eER . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5] . D===================eER. . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7] . D=========================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D========================eeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9] . D===============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1719,43 +1719,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 14.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 5. 1 20.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 7. 1 26.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 9. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.9 0.2 2.5 <total>
# CHECK: [12] Code Region - G13
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3604
+# CHECK-NEXT: Total Cycles: 1210
# CHECK-NEXT: Total uOps: 2800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.78
-# CHECK-NEXT: IPC: 0.28
+# CHECK-NEXT: uOps Per Cycle: 2.31
+# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 5.7
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 01
-# CHECK: [0,0] DeeeeeeeER. . . . . . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D=======eER . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D======eeeeeeeER . . . . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3] .D=============eER . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D============eeeeeeeER . . . . ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5] . D===================eER. . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==================eeeeeeeER . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7] . D=========================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=========================eeeeeeeeER. ld1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9] . D=================================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE-----R. . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE-----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE-----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE-----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D========eeeeeeeeER ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9] . D=========eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1765,22 +1765,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 14.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 5. 1 20.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 7. 1 26.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 26.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 9. 1 34.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.8 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 9. 1 10.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.3 0.2 2.6 <total>
# CHECK: [13] Code Region - G14
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
@@ -1790,18 +1790,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1811,22 +1811,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 16.7 0.1 3.0 <total>
# CHECK: [14] Code Region - G15
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
@@ -1836,18 +1836,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1857,43 +1857,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 16.7 0.1 3.0 <total>
# CHECK: [15] Code Region - G16
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 1.66
+# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld1r { v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1903,43 +1903,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
# CHECK: [16] Code Region - G17
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 3.92
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1r { v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld1r { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1949,43 +1949,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
# CHECK: [17] Code Region - G18
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 3.92
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1r { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld1r { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1995,43 +1995,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
# CHECK: [18] Code Region - G19
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.60
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.71
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1r { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7] . D==============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2041,43 +2041,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7. 1 31.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.1 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.6 0.1 3.0 <total>
# CHECK: [19] Code Region - G20
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2900
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.72
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 5.69
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D======================eeeeeeeeER. . . ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7] . D==============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=============================eeeeeeeeER. ld2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D=====================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2087,43 +2087,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 23.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7. 1 31.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 30.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 38.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.7 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.2 0.1 3.0 <total>
# CHECK: [20] Code Region - G21
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.67
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 5.29
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D==============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2133,43 +2133,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 31.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.1 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.6 0.1 3.0 <total>
# CHECK: [21] Code Region - G22
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 3310
# CHECK-NEXT: Total uOps: 2600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.65
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 0.79
+# CHECK-NEXT: IPC: 0.30
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D==============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D=======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2179,22 +2179,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 7. 1 31.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 24.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.0 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 16.5 0.1 3.0 <total>
# CHECK: [22] Code Region - G23
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 10
@@ -2204,18 +2204,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2225,43 +2225,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 16.7 0.1 3.0 <total>
# CHECK: [23] Code Region - G24
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 2603
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.62
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 0.96
+# CHECK-NEXT: IPC: 0.38
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2r { v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3] D=========eE------R . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5] .D================eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7] .D=================eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9] . D=================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2271,43 +2271,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 17.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7. 1 18.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9. 1 18.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 12.5 0.1 3.0 <total>
# CHECK: [24] Code Region - G25
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.62
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.90
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2r { v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2r { v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2317,43 +2317,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
# CHECK: [25] Code Region - G26
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2500
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.62
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.90
+# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2r { v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld2r { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2363,43 +2363,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
# CHECK: [26] Code Region - G27
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 609
# CHECK-NEXT: Total uOps: 2800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.70
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.60
+# CHECK-NEXT: IPC: 1.64
# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2r { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3] D================eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5] .D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7] .D===============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER. ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2409,43 +2409,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
# CHECK: [27] Code Region - G28
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 759
# CHECK-NEXT: Total uOps: 3700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.92
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.87
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2455,43 +2455,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
# CHECK: [28] Code Region - G29
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 759
# CHECK-NEXT: Total uOps: 3800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.95
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 5.01
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2501,43 +2501,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
# CHECK: [29] Code Region - G30
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 2010
# CHECK-NEXT: Total uOps: 3700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.92
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 1.84
+# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5] . D==eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7] . D=========eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D===============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9] . D================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2547,22 +2547,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7. 1 10.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 16.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9. 1 17.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 6.3 0.2 3.0 <total>
# CHECK: [30] Code Region - G31
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3500
# CHECK: Dispatch Width: 10
@@ -2572,18 +2572,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2593,22 +2593,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [31] Code Region - G32
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3500
# CHECK: Dispatch Width: 10
@@ -2618,18 +2618,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2639,43 +2639,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [32] Code Region - G33
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 759
# CHECK-NEXT: Total uOps: 3700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.92
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.87
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2685,43 +2685,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
# CHECK: [33] Code Region - G34
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 759
# CHECK-NEXT: Total uOps: 3800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.95
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 5.01
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2731,43 +2731,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
# CHECK: [34] Code Region - G35
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 759
# CHECK-NEXT: Total uOps: 3700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.92
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.87
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2777,43 +2777,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
# CHECK: [35] Code Region - G36
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4204
+# CHECK-NEXT: Total Cycles: 959
# CHECK-NEXT: Total uOps: 4600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.09
-# CHECK-NEXT: IPC: 0.24
+# CHECK-NEXT: uOps Per Cycle: 4.80
+# CHECK-NEXT: IPC: 1.04
# CHECK-NEXT: Block RThroughput: 9.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012345
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeeER. . . . . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3] .D================eER . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D===============eeeeeeeeER . . . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5] . D=======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D======================eeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7] . D==============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=============================eeeeeeeeeER. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9] . D======================================eER add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3] .D=eE-------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5] . D==eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D===eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9] . D====eE-------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2823,43 +2823,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 23.0 0.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 7. 1 31.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 30.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 9. 1 39.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.8 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9. 1 5.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.4 3.2 <total>
# CHECK: [36] Code Region - G37
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4304
+# CHECK-NEXT: Total Cycles: 1008
# CHECK-NEXT: Total uOps: 4800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.12
-# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: uOps Per Cycle: 4.76
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123456
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1] D========eER . . . . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeeER. . . . . .. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3] .D================eER . . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D===============eeeeeeeeeER . . . .. ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5] . D========================eER. . . .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=======================eeeeeeeeeER . .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7] . D================================eER . .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===============================eeeeeeeeER. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9] . D=======================================eER add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3] .D=eE-------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5] . D===eE-------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeeeER . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7] . D===eE-------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2869,43 +2869,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 5. 1 25.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 7. 1 33.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 32.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 9. 1 40.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.5 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5. 1 4.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.5 3.3 <total>
# CHECK: [37] Code Region - G38
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4304
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.12
-# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: uOps Per Cycle: 4.76
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123456
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeeER. . . . . .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D================eER . . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D===============eeeeeeeeER . . . .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5] . D=======================eER . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D======================eeeeeeeeeER . .. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7] . D===============================eER . .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeeER. ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9] . D=======================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE-------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7] . D===eE-------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D====eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9] . D=====eE-------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2915,22 +2915,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 5. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 23.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 7. 1 32.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 9. 1 40.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 20.1 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9. 1 6.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.5 3.3 <total>
# CHECK: [38] Code Region - G39
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4500
# CHECK: Dispatch Width: 10
@@ -2940,18 +2940,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2961,22 +2961,22 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [39] Code Region - G40
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4500
# CHECK: Dispatch Width: 10
@@ -2986,18 +2986,18 @@ add x0, x27, 1
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3007,43 +3007,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [40] Code Region - G41
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total uOps: 4600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.15
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 2.19
+# CHECK-NEXT: IPC: 0.48
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123
+
+# CHECK: [0,0] DeeeeeeeeER . . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1] D=eE------R . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D========eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=======eeeeeeeeER. . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5] . D========eE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7] . D==========eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9] . D==========eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3053,43 +3053,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5. 1 9.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 10.0 2.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7. 1 11.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 10.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9. 1 11.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 7.9 0.3 3.0 <total>
# CHECK: [41] Code Region - G42
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 1008
# CHECK-NEXT: Total uOps: 4800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.20
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.76
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3099,43 +3099,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.5 3.0 <total>
# CHECK: [42] Code Region - G43
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 4004
+# CHECK-NEXT: Total Cycles: 1008
# CHECK-NEXT: Total uOps: 4700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.17
-# CHECK-NEXT: IPC: 0.25
+# CHECK-NEXT: uOps Per Cycle: 4.66
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 0123
-
-# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7] . D=============================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER. ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D====================================eER add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3145,43 +3145,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 30.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 37.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 19.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.5 3.0 <total>
# CHECK: [43] Code Region - G44
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3604
+# CHECK-NEXT: Total Cycles: 708
# CHECK-NEXT: Total uOps: 3900
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.08
-# CHECK-NEXT: IPC: 0.28
+# CHECK-NEXT: uOps Per Cycle: 5.51
+# CHECK-NEXT: IPC: 1.41
# CHECK-NEXT: Block RThroughput: 6.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . . . . . . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D========eER . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===============eER. . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5] . D======================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeER . . ldp s1, s2, [x27], #248
-# CHECK-NEXT: [0,7] . D===========================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===========================eeeeeeER. ldp d1, d2, [x27], #496
-# CHECK-NEXT: [0,9] . D=================================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeE-R ldp s1, s2, [x27], #248
+# CHECK-NEXT: [0,7] . D===eE-----R add x0, x27, #1
+# CHECK-NEXT: [0,8] . D===eeeeeeER ldp d1, d2, [x27], #496
+# CHECK-NEXT: [0,9] . D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3191,43 +3191,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ldp s1, s2, [x27], #248
-# CHECK-NEXT: 7. 1 28.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 28.0 0.0 0.0 ldp d1, d2, [x27], #496
-# CHECK-NEXT: 9. 1 34.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 18.4 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 1.0 ldp s1, s2, [x27], #248
+# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldp d1, d2, [x27], #496
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.9 0.3 2.8 <total>
# CHECK: [44] Code Region - G45
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2506
+# CHECK-NEXT: Total Cycles: 507
# CHECK-NEXT: Total uOps: 2800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.12
-# CHECK-NEXT: IPC: 0.40
+# CHECK-NEXT: uOps Per Cycle: 5.52
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . ldp q1, q2, [x27], #992
-# CHECK-NEXT: [0,1] D======eER. . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeER. . . . ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3] .D===========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5] .D=================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D================eeeeeeER . ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7] . D======================eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeER ldp w1, w2, [x27], #248
-# CHECK-NEXT: [0,9] . D======================eE--R add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992
+# CHECK-NEXT: [0,1] D=eE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER.. ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3] .D=eE----R.. add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER. ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5] .D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7] . D==eE----R add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeE-R ldp w1, w2, [x27], #248
+# CHECK-NEXT: [0,9] . D==eE---R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3237,43 +3237,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 17.0 0.0 0.0 ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: 7. 1 23.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ldp w1, w2, [x27], #248
-# CHECK-NEXT: 9. 1 23.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 14.2 0.1 0.2 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 1.0 ldp w1, w2, [x27], #248
+# CHECK-NEXT: 9. 1 3.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.2 0.1 2.0 <total>
# CHECK: [45] Code Region - G46
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1304
+# CHECK-NEXT: Total Cycles: 1006
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.53
-# CHECK-NEXT: IPC: 0.77
+# CHECK-NEXT: uOps Per Cycle: 1.99
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . .. ldp x1, x2, [x27], #496
-# CHECK-NEXT: [0,1] D=eE--R . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . .. ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3] D==eE--R . .. add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . .. ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5] .D==eE--R . .. add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeER .. ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: [0,7] .D=======eER .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeeeeER. ldpsw x1, x2, [x27, #248]!
-# CHECK-NEXT: [0,9] . D===========eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . . ldp x1, x2, [x27], #496
+# CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . . ldp w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3] D==eE--R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . . ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5] .D==eE--R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeER . ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: [0,7] .D===eE---R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D======eeeeeER ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9] . D=======eE---R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3289,10 +3289,10 @@ add x0, x27, 1
# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]!
# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: 7. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 ldpsw x1, x2, [x27, #248]!
-# CHECK-NEXT: 9. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.3 0.1 0.6 <total>
+# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 7.0 4.0 0.0 ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: 9. 1 8.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.5 0.5 1.2 <total>
# CHECK: [46] Code Region - G47
@@ -3344,28 +3344,28 @@ add x0, x27, 1
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 0.67
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 3.94
+# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . ldr b1, [x27, #254]!
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D======eeeeeeER. . . . . ldr h1, [x27, #254]!
-# CHECK-NEXT: [0,3] D============eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===========eeeeeeER . . . ldr s1, [x27, #254]!
-# CHECK-NEXT: [0,5] .D=================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=================eeeeeeER . . ldr d1, [x27, #254]!
-# CHECK-NEXT: [0,7] .D=======================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======================eeeeeeER. ldr q1, [x27, #254]!
-# CHECK-NEXT: [0,9] . D============================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27, #254]!
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ldr s1, [x27, #254]!
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27, #254]!
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ldr q1, [x27, #254]!
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3375,16 +3375,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]!
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ldr h1, [x27, #254]!
-# CHECK-NEXT: 3. 1 13.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 12.0 0.0 0.0 ldr s1, [x27, #254]!
-# CHECK-NEXT: 5. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 18.0 0.0 0.0 ldr d1, [x27, #254]!
-# CHECK-NEXT: 7. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 23.0 0.0 0.0 ldr q1, [x27, #254]!
-# CHECK-NEXT: 9. 1 29.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27, #254]!
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr s1, [x27, #254]!
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27, #254]!
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldr q1, [x27, #254]!
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
# CHECK: [48] Code Region - G49
@@ -3528,28 +3528,27 @@ add x0, x27, 1
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 704
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1700
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 2.41
-# CHECK-NEXT: IPC: 1.42
+# CHECK-NEXT: uOps Per Cycle: 3.37
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]!
-# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . ldrsw x1, [x27], #254
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeER . ldrsw x1, [x27, #254]!
-# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeE-R . st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7] .D====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] .D====eeER. st1 { v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9] .D======eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER. ldrsw x1, [x27], #254
+# CHECK-NEXT: [0,3] D==eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeER ldrsw x1, [x27, #254]!
+# CHECK-NEXT: [0,5] D===eE--R add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeE-R st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7] .D===eE-R add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeER st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3565,37 +3564,36 @@ add x0, x27, 1
# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]!
# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 6. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: 7. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.5 0.1 0.7 <total>
+# CHECK-NEXT: 7. 1 4.0 0.0 1.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 0.8 <total>
# CHECK: [52] Code Region - G53
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.99
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeER. . . st1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D==eeER . . st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3] D====eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===eeER . . st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5] .D=====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=====eeER . st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7] .D=======eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeER. st1 { v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9] . D========eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3605,43 +3603,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 5. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 6.0 0.0 0.0 st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: 7. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 st1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 9. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 5.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
# CHECK: [53] Code Region - G54
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.99
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeER. . . st1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D==eeER . . st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3] D====eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===eeER . . st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5] .D=====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=====eeER . st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeER. st1 { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9] . D========eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeER . st1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3651,43 +3648,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 5. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 6.0 0.0 0.0 st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: 7. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 st1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 9. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 5.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
# CHECK: [54] Code Region - G55
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.99
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeER. . . st1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D==eeER . . st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3] D====eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===eeER . . st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5] .D=====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=====eeER . st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeER. st1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9] . D========eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeER . st1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3697,43 +3693,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 5. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 6.0 0.0 0.0 st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: 7. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 9. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 5.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
# CHECK: [55] Code Region - G56
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 2.39
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 4.76
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeER. . . st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D==eeER . . st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3] D====eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===eeER . . st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5] .D=====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=====eeER . st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7] .D=======eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeER. st1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9] . D========eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3743,43 +3738,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 5. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 6.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 7. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 9. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 5.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
# CHECK: [56] Code Region - G57
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 2.59
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 5.16
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===eeER . . st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5] .D=====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D====eeER . st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7] . D======eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeER. st1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9] . D========eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3789,43 +3783,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 5. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 5.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 7. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 9. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.8 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 0.0 <total>
# CHECK: [57] Code Region - G58
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 2.59
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 5.16
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeeER. . . st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D==eeER . . st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3] D====eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===eeER . . st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5] .D=====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=====eeER . st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7] .D=======eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeER. st1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9] . D========eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3835,43 +3828,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 5. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 6.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 7. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 9. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 5.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
# CHECK: [58] Code Region - G59
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 3400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 3.39
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 4.84
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 6.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeER . . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5] . D====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7] . D=====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeER. st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9] . D======eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3881,43 +3873,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 5. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 7. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.9 0.2 0.0 <total>
# CHECK: [59] Code Region - G60
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 3600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 3.59
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 5.12
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 6.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeER . . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5] . D====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeER . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7] . D=====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeER. st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D======eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3927,43 +3918,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 5. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.9 0.2 0.0 <total>
# CHECK: [60] Code Region - G61
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 3400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 3.39
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 4.84
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 6.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeER . . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5] . D====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D=====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeER. st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D======eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3973,43 +3963,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.7 0.2 0.0 <total>
# CHECK: [61] Code Region - G62
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 3.59
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 5.11
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 6.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeER . . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5] . D====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7] . D=====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeER. st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9] . D======eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7] . D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4019,43 +4009,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 5. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 7. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 0.0 <total>
# CHECK: [62] Code Region - G63
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 804
# CHECK-NEXT: Total uOps: 4200
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.18
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 5.22
+# CHECK-NEXT: IPC: 1.24
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5] . D====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeER . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7] . D=====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeER. st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D======eER add x0, x27, #1
+# CHECK: [0,0] DeeER. .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1] D=eER. .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3] .D=eER .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5] . D==eER .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7] . D==eER .. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D===eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . D====eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4065,43 +4055,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 5. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 7. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.5 0.4 0.0 <total>
# CHECK: [63] Code Region - G64
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1004
+# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 3800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 3.78
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: uOps Per Cycle: 5.41
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeER . . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5] . D====eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeER . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7] . D=====eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeER. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D======eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4111,43 +4100,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 4.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.1 0.2 0.0 <total>
# CHECK: [64] Code Region - G65
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1604
+# CHECK-NEXT: Total Cycles: 706
# CHECK-NEXT: Total uOps: 3200
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 2.00
-# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: uOps Per Cycle: 4.53
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
+# CHECK-NEXT: 012
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] D==eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=eeER . . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeER . . st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5] . D======eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D======eeeeER. . st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7] . D==========eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=========eeeeER. st1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D=============eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeER . st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5] . D===eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D===eeeeER. st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7] . D====eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D===eeeeER st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D====eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4157,43 +4146,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 5. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 7.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: 7. 1 11.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 10.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 9. 1 14.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 6.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.3 0.6 <total>
# CHECK: [65] Code Region - G66
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.00
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 3.95
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D====eeeeER . . . st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3] D========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===========eeeeER . . st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7] .D===============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4203,43 +4192,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 12.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: 7. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 10.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
# CHECK: [66] Code Region - G67
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2200
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.10
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 4.35
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D====eeeeER . . . st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] D========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===========eeeeER . . st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7] .D===============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4249,43 +4238,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 12.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: 7. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 10.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
# CHECK: [67] Code Region - G68
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.20
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 4.74
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D====eeeeER . . . st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3] D========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===========eeeeER . . st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7] .D===============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4295,43 +4284,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 12.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 7. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 10.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
# CHECK: [68] Code Region - G69
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.30
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 5.14
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D===eeeeER . . . st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D=======eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==========eeeeER . . st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7] . D==============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4341,43 +4330,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 4.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 11.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 15.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 9.8 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 1.0 <total>
# CHECK: [69] Code Region - G70
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.20
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 4.74
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D====eeeeER . . . st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===========eeeeER . . st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7] .D===============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4387,43 +4376,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 12.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 7. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 10.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
# CHECK: [70] Code Region - G71
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.00
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 3.95
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D====eeeeER . . . st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3] D========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===========eeeeER . . st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7] .D===============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4433,43 +4422,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 12.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 7. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 10.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
# CHECK: [71] Code Region - G72
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2004
+# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.00
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 3.95
+# CHECK-NEXT: IPC: 1.98
# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . . . st2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D====eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D====eeeeER . . . st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3] D========eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=======eeeeER. . . st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===========eeeeER . . st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7] .D===============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============eeeeER. st2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D==================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4479,43 +4468,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 5. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 12.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 7. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 15.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 9. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 10.2 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
# CHECK: [72] Code Region - G73
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1804
+# CHECK-NEXT: Total Cycles: 607
# CHECK-NEXT: Total uOps: 2800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.55
-# CHECK-NEXT: IPC: 0.55
+# CHECK-NEXT: uOps Per Cycle: 4.61
+# CHECK-NEXT: IPC: 1.65
# CHECK-NEXT: Block RThroughput: 4.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 01
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . . . .. st2g x26, [x27], #4064
-# CHECK-NEXT: [0,1] D=eER. . . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eER. . . .. st2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,3] D==eER . . .. add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . .. st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,5] .D=======eER . .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D======eeeeeER .. st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,7] . D===========eER .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==========eeeeeER. st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,9] . D===============eER add x0, x27, #1
+# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064
+# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eER. . . st2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,3] D==eER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,7] . D==eE---R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4529,39 +4518,39 @@ add x0, x27, 1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]!
# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 5. 1 8.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 7.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 7. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 11.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 9. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 6.4 0.1 0.0 <total>
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 7. 1 3.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.5 0.2 1.0 <total>
# CHECK: [73] Code Region - G74
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2904
+# CHECK-NEXT: Total Cycles: 708
# CHECK-NEXT: Total uOps: 3800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.31
-# CHECK-NEXT: IPC: 0.34
+# CHECK-NEXT: uOps Per Cycle: 5.37
+# CHECK-NEXT: IPC: 1.41
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeER . . . . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3] .D==========eER. . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=========eeeeeeER. . . . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5] . D===============eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==============eeeeeeER . . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7] . D====================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===================eeeeeeER. st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D=========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4571,43 +4560,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3. 1 11.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 10.0 0.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 5. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 15.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 7. 1 21.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 20.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 26.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 1.9 <total>
# CHECK: [74] Code Region - G75
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2704
+# CHECK-NEXT: Total Cycles: 707
# CHECK-NEXT: Total uOps: 3400
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.26
-# CHECK-NEXT: IPC: 0.37
+# CHECK-NEXT: uOps Per Cycle: 4.81
+# CHECK-NEXT: IPC: 1.41
# CHECK-NEXT: Block RThroughput: 6.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=====eER . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D====eeeeeER . . . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=========eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D========eeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5] . D==============eER. . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=============eeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D==================eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=================eeeeeeER. st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D=======================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeER . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeER . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE---R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4617,43 +4606,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 10.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 15.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 14.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 19.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 18.0 0.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 24.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 12.1 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.7 0.2 1.7 <total>
# CHECK: [75] Code Region - G76
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 757
# CHECK-NEXT: Total uOps: 4000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.33
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 5.28
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeER. . . . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3] .D===========eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeeeER . . . st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5] . D================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===============eeeeeeER . . st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D=====================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====================eeeeeeER. st3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9] . D==========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4663,43 +4652,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 5. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 16.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 7. 1 22.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 21.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 9. 1 27.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
# CHECK: [76] Code Region - G77
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 757
# CHECK-NEXT: Total uOps: 4000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.33
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 5.28
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeER. . . . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3] .D===========eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeeeER . . . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===============eeeeeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7] . D=====================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====================eeeeeeER. st3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9] . D==========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4709,43 +4698,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 5. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 16.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 7. 1 22.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 21.0 0.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 9. 1 27.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
# CHECK: [77] Code Region - G78
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2904
+# CHECK-NEXT: Total Cycles: 807
# CHECK-NEXT: Total uOps: 4200
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.45
-# CHECK-NEXT: IPC: 0.34
+# CHECK-NEXT: uOps Per Cycle: 5.20
+# CHECK-NEXT: IPC: 1.24
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeER. . . . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3] .D===========eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeeeER . . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===============eeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7] . D====================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===================eeeeeeER. st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9] . D=========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeER . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7] . D==eE---R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4755,43 +4744,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 5. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 16.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 7. 1 21.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 20.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 9. 1 26.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 13.7 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7. 1 3.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 1.9 <total>
# CHECK: [78] Code Region - G79
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3304
+# CHECK-NEXT: Total Cycles: 1205
# CHECK-NEXT: Total uOps: 5800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.76
-# CHECK-NEXT: IPC: 0.30
+# CHECK-NEXT: uOps Per Cycle: 4.81
+# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 12.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123456
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1] D======eER. . . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeeER . . . .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3] . D===========eER . . . .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeeeER . . .. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5] . D================eER . . .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===============eeeeeeeER . .. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7] . D=====================eER. .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . .D====================eeeeeeeER. st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9] . . D==========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3] . DeE-----R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5] . D=eE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . .D=eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9] . . D=eE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4801,43 +4790,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 5. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 16.0 0.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 7. 1 22.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 21.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 9. 1 27.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 2.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.8 0.4 2.3 <total>
# CHECK: [79] Code Region - G80
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 1006
# CHECK-NEXT: Total uOps: 4800
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.60
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 4.77
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 9.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeER . . . . . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D=====eER . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D====eeeeeeER . . . . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==========eER. . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=========eeeeeeER. . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5] . D===============eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==============eeeeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7] . D====================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===================eeeeeeER. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D=========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeER . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5] . D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==eeeeeeeER. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4847,43 +4836,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 5.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 11.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 10.0 0.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 16.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 15.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 21.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 20.0 0.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 26.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 13.1 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.6 0.4 2.0 <total>
# CHECK: [80] Code Region - G81
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3204
+# CHECK-NEXT: Total Cycles: 1058
# CHECK-NEXT: Total uOps: 5200
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.62
-# CHECK-NEXT: IPC: 0.31
+# CHECK-NEXT: uOps Per Cycle: 4.91
+# CHECK-NEXT: IPC: 0.95
# CHECK-NEXT: Block RThroughput: 10.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeER. . . . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] .D======eER . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] . D=====eeeeeeeER . . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3] . D===========eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeeeER . . . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5] . D================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===============eeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7] . D=====================eER. . add x0, x27, #1
-# CHECK-NEXT: [0,8] . .D====================eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9] . .D==========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeER . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==eeeeeeER. . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5] . D===eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D===eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7] . D====eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . .D===eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9] . .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4893,43 +4882,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 5. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 16.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 7. 1 22.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 21.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 9. 1 27.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 4.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.9 0.6 2.2 <total>
# CHECK: [81] Code Region - G82
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 3004
+# CHECK-NEXT: Total Cycles: 757
# CHECK-NEXT: Total uOps: 4000
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.33
-# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: uOps Per Cycle: 5.28
+# CHECK-NEXT: IPC: 1.32
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0123
-# CHECK-NEXT: Index 0123456789 0123456789
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . . . . . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D======eER. . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeER. . . . . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3] .D===========eER . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeeeER . . . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5] . D================eER . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===============eeeeeeER . . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D=====================eER . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====================eeeeeeER. st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9] . D==========================eER add x0, x27, #1
+# CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4939,43 +4928,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 5. 1 17.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 16.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 7. 1 22.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 21.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 9. 1 27.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
# CHECK: [82] Code Region - G83
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2104
+# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3600
# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 1.71
-# CHECK-NEXT: IPC: 0.48
+# CHECK-NEXT: uOps Per Cycle: 5.11
+# CHECK-NEXT: IPC: 1.42
# CHECK-NEXT: Block RThroughput: 6.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 01234
-
-# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1] D======eER. . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=====eeeeeeER. . . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D===========eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==========eeeeER . . st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5] . D==============eER. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=============eeeeER . st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D=================eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D================eER. stg x26, [x27], #4064
-# CHECK-NEXT: [0,9] . D=================eER add x0, x27, #1
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1] D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5] . D==eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D==eE--R add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eE-R stg x26, [x27], #4064
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4985,16 +4974,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1. 1 7.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 6.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 12.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 5. 1 15.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 14.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 7. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 17.0 0.0 0.0 stg x26, [x27], #4064
-# CHECK-NEXT: 9. 1 18.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 11.9 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 1.0 1.0 stg x26, [x27], #4064
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 1.3 <total>
# CHECK: [83] Code Region - G84
More information about the llvm-commits
mailing list