[llvm] a5a6008 - [AArch64] Fix schedmodel pre/post-index loads and stores for Neoverse V2
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 4 03:09:16 PDT 2023
Author: Sjoerd Meijer
Date: 2023-09-04T11:09:06+01:00
New Revision: a5a600898863336173a0fba5426e35fb8d18d18d
URL: https://github.com/llvm/llvm-project/commit/a5a600898863336173a0fba5426e35fb8d18d18d
DIFF: https://github.com/llvm/llvm-project/commit/a5a600898863336173a0fba5426e35fb8d18d18d.diff
LOG: [AArch64] Fix schedmodel pre/post-index loads and stores for Neoverse V2
Fix the operand description: the address update (WriteAdr) was in the wrong
place in the list of writes. As a result, the latency of the update was modelled
incorrectly: it wasn't available as early as it should be. This was visible in
llvm-mca timeline views.
This fixes the problem for the Neoverse V2, but the problem also affects the
other Neoverse cores.
Patch by: Ricardo Jesus
Differential Revision: https://reviews.llvm.org/D159254
Added:
llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
Modified:
llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index c7edc0a567524a7..b8e1dee705022c1 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -1213,7 +1213,7 @@ def : InstRW<[V2Write_5cyc_1L_1F], (instrs LDRWl, LDRXl, LDRSWl, PRFMl)>;
def : InstRW<[V2Write_5cyc_1I_3L, WriteLDHi], (instrs LDPSWi)>;
// Load pair, immed post-index or immed pre-index, signed words
-def : InstRW<[V2Write_5cyc_1I_3L, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_5cyc_1I_3L, WriteLDHi],
(instregex "^LDPSW(post|pre)$")>;
// Store instructions
@@ -1224,7 +1224,7 @@ def : InstRW<[V2Write_5cyc_1I_3L, WriteLDHi, WriteAdr],
def : SchedAlias<WriteST, V2Write_1cyc_1L01_1D>;
def : SchedAlias<WriteSTIdx, V2Write_1cyc_1L01_1D>;
def : SchedAlias<WriteSTP, V2Write_1cyc_1L01_1D>;
-def : SchedAlias<WriteAdr, V2Write_1cyc_1I>; // copied from A57.
+def : SchedAlias<WriteAdr, V2Write_1cyc_1I>;
// Tag load instructions
// -----------------------------------------------------------------------------
@@ -1337,7 +1337,7 @@ def : InstRW<[V2Write_6cyc_1L], (instregex "^LDUR[BHSDQ]i$")>;
// Load vector reg, immed post-index
// Load vector reg, immed pre-index
-def : InstRW<[V2Write_6cyc_1I_1L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_1I_1L],
(instregex "^LDR[BHSDQ](pre|post)$")>;
// Load vector reg, unsigned immed
@@ -1359,12 +1359,12 @@ def : InstRW<[V2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
// Load vector pair, immed post-index, S/D-form
// Load vector pair, immed pre-index, S/D-form
-def : InstRW<[V2Write_6cyc_1I_1L, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_1I_1L, WriteLDHi],
(instregex "^LDP[SD](pre|post)$")>;
// Load vector pair, immed post-index, Q-form
// Load vector pair, immed pre-index, Q-form
-def : InstRW<[V2Write_6cyc_2I_2L, WriteLDHi, WriteAdr], (instrs LDPQpost,
+def : InstRW<[WriteAdr, V2Write_6cyc_2I_2L, WriteLDHi], (instrs LDPQpost,
LDPQpre)>;
// FP store instructions
@@ -1725,220 +1725,220 @@ def : InstRW<[V2Write_5cyc_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
// ASIMD load, 1 element, multiple, 1 reg, D-form
def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_6cyc_1L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_1L],
(instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 1 reg, Q-form
def : InstRW<[V2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_6cyc_1L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_1L],
(instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 2 reg, D-form
def : InstRW<[V2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_6cyc_2L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_2L],
(instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 2 reg, Q-form
def : InstRW<[V2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_6cyc_2L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_2L],
(instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 3 reg, D-form
def : InstRW<[V2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_6cyc_3L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_3L],
(instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 3 reg, Q-form
def : InstRW<[V2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_6cyc_3L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_6cyc_3L],
(instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, multiple, 4 reg, D-form
def : InstRW<[V2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_7cyc_4L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_7cyc_4L],
(instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, multiple, 4 reg, Q-form
def : InstRW<[V2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_7cyc_4L, WriteAdr],
+def : InstRW<[WriteAdr, V2Write_7cyc_4L],
(instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 1 element, one lane, B/H/S
// ASIMD load, 1 element, one lane, D
def : InstRW<[V2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)$")>;
-def : InstRW<[V2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
// ASIMD load, 1 element, all lanes, D-form, B/H/S
// ASIMD load, 1 element, all lanes, D-form, D
def : InstRW<[V2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 1 element, all lanes, Q-form
def : InstRW<[V2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, multiple, D-form, B/H/S
def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[V2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
// ASIMD load, 2 element, multiple, Q-form, B/H/S
// ASIMD load, 2 element, multiple, Q-form, D
def : InstRW<[V2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_8cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 2 element, one lane, B/H
// ASIMD load, 2 element, one lane, S
// ASIMD load, 2 element, one lane, D
def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)$")>;
-def : InstRW<[V2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
// ASIMD load, 2 element, all lanes, D-form, B/H/S
// ASIMD load, 2 element, all lanes, D-form, D
def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 2 element, all lanes, Q-form
def : InstRW<[V2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 3 element, multiple, D-form, B/H/S
def : InstRW<[V2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[V2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
// ASIMD load, 3 element, multiple, Q-form, B/H/S
// ASIMD load, 3 element, multiple, Q-form, D
def : InstRW<[V2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 3 element, one lane, B/H
// ASIMD load, 3 element, one lane, S
// ASIMD load, 3 element, one lane, D
def : InstRW<[V2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)$")>;
-def : InstRW<[V2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
// ASIMD load, 3 element, all lanes, D-form, B/H/S
// ASIMD load, 3 element, all lanes, D-form, D
def : InstRW<[V2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 3 element, all lanes, Q-form, B/H/S
// ASIMD load, 3 element, all lanes, Q-form, D
def : InstRW<[V2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, multiple, D-form, B/H/S
def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[V2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
// ASIMD load, 4 element, multiple, Q-form, B/H/S
// ASIMD load, 4 element, multiple, Q-form, D
def : InstRW<[V2Write_9cyc_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_9cyc_6L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_9cyc_6L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD load, 4 element, one lane, B/H
// ASIMD load, 4 element, one lane, S
// ASIMD load, 4 element, one lane, D
def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)$")>;
-def : InstRW<[V2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
// ASIMD load, 4 element, all lanes, D-form, B/H/S
// ASIMD load, 4 element, all lanes, D-form, D
-def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[V2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_3L_4V], (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
// ASIMD load, 4 element, all lanes, Q-form, B/H/S
// ASIMD load, 4 element, all lanes, Q-form, D
-def : InstRW<[V2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_8cyc_4L_4V, WriteAdr], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[V2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[WriteAdr, V2Write_8cyc_4L_4V], (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
// ASIMD store instructions
// -----------------------------------------------------------------------------
// ASIMD store, 1 element, multiple, 1 reg, D-form
def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_2cyc_1L01_1V01, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 1 reg, Q-form
def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_2cyc_1L01_1V01, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 2 reg, D-form
def : InstRW<[V2Write_2cyc_1L01_1V01], (instregex "ST1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_2cyc_1L01_1V01, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_1L01_1V01], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 2 reg, Q-form
def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "ST1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_2cyc_2L01_2V01, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 3 reg, D-form
def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "ST1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_2cyc_2L01_2V01, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 3 reg, Q-form
def : InstRW<[V2Write_2cyc_3L01_3V01], (instregex "ST1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_2cyc_3L01_3V01, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_3L01_3V01], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, multiple, 4 reg, D-form
def : InstRW<[V2Write_2cyc_2L01_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[V2Write_2cyc_2L01_2V01, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_2L01_2V01], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
// ASIMD store, 1 element, multiple, 4 reg, Q-form
def : InstRW<[V2Write_2cyc_4L01_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_2cyc_4L01_4V01, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_2cyc_4L01_4V01], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 1 element, one lane, B/H/S
// ASIMD store, 1 element, one lane, D
def : InstRW<[V2Write_4cyc_1L01_2V01], (instregex "ST1i(8|16|32|64)$")>;
-def : InstRW<[V2Write_4cyc_1L01_2V01, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST1i(8|16|32|64)_POST$")>;
// ASIMD store, 2 element, multiple, D-form, B/H/S
def : InstRW<[V2Write_4cyc_1L01_2V01], (instregex "ST2Twov(8b|4h|2s)$")>;
-def : InstRW<[V2Write_4cyc_1L01_2V01, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
// ASIMD store, 2 element, multiple, Q-form, B/H/S
// ASIMD store, 2 element, multiple, Q-form, D
def : InstRW<[V2Write_4cyc_2L01_4V01], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_4cyc_2L01_4V01, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_4cyc_2L01_4V01], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 2 element, one lane, B/H/S
// ASIMD store, 2 element, one lane, D
def : InstRW<[V2Write_4cyc_1L01_2V01], (instregex "ST2i(8|16|32|64)$")>;
-def : InstRW<[V2Write_4cyc_1L01_2V01, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_4cyc_1L01_2V01], (instregex "ST2i(8|16|32|64)_POST$")>;
// ASIMD store, 3 element, multiple, D-form, B/H/S
def : InstRW<[V2Write_5cyc_2L01_4V01], (instregex "ST3Threev(8b|4h|2s)$")>;
-def : InstRW<[V2Write_5cyc_2L01_4V01, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_5cyc_2L01_4V01], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
// ASIMD store, 3 element, multiple, Q-form, B/H/S
// ASIMD store, 3 element, multiple, Q-form, D
def : InstRW<[V2Write_6cyc_3L01_6V01], (instregex "ST3Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[V2Write_6cyc_3L01_6V01, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_6cyc_3L01_6V01], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
// ASIMD store, 3 element, one lane, B/H
// ASIMD store, 3 element, one lane, S
// ASIMD store, 3 element, one lane, D
def : InstRW<[V2Write_5cyc_2L01_4V01], (instregex "ST3i(8|16|32|64)$")>;
-def : InstRW<[V2Write_5cyc_2L01_4V01, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_5cyc_2L01_4V01], (instregex "ST3i(8|16|32|64)_POST$")>;
// ASIMD store, 4 element, multiple, D-form, B/H/S
def : InstRW<[V2Write_6cyc_2L01_6V01], (instregex "ST4Fourv(8b|4h|2s)$")>;
-def : InstRW<[V2Write_6cyc_2L01_6V01, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_6cyc_2L01_6V01], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
// ASIMD store, 4 element, multiple, Q-form, B/H/S
def : InstRW<[V2Write_7cyc_4L01_12V01], (instregex "ST4Fourv(16b|8h|4s)$")>;
-def : InstRW<[V2Write_7cyc_4L01_12V01, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_7cyc_4L01_12V01], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
// ASIMD store, 4 element, multiple, Q-form, D
def : InstRW<[V2Write_5cyc_4L01_8V01], (instregex "ST4Fourv(2d)$")>;
-def : InstRW<[V2Write_5cyc_4L01_8V01, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_5cyc_4L01_8V01], (instregex "ST4Fourv(2d)_POST$")>;
// ASIMD store, 4 element, one lane, B/H/S
def : InstRW<[V2Write_6cyc_1L01_3V01], (instregex "ST4i(8|16|32)$")>;
-def : InstRW<[V2Write_6cyc_1L01_3V01, WriteAdr], (instregex "ST4i(8|16|32)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_6cyc_1L01_3V01], (instregex "ST4i(8|16|32)_POST$")>;
// ASIMD store, 4 element, one lane, D
def : InstRW<[V2Write_4cyc_2L01_4V01], (instregex "ST4i(64)$")>;
-def : InstRW<[V2Write_4cyc_2L01_4V01, WriteAdr], (instregex "ST4i(64)_POST$")>;
+def : InstRW<[WriteAdr, V2Write_4cyc_2L01_4V01], (instregex "ST4i(64)_POST$")>;
// Cryptography extensions
// -----------------------------------------------------------------------------
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
new file mode 100644
index 000000000000000..1ef746813966de3
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
@@ -0,0 +1,3979 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v2 --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=1 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN G01
+ld1 { v1.1d }, [x27], #8
+ld1 { v1.2d }, [x27], #16
+ld1 { v1.2s }, [x27], #8
+ld1 { v1.4h }, [x27], #8
+ld1 { v1.4s }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G02
+ld1 { v1.8b }, [x27], #8
+ld1 { v1.8h }, [x27], #16
+ld1 { v1.16b }, [x27], #16
+ld1 { v1.1d }, [x27], x28
+ld1 { v1.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G03
+ld1 { v1.2s }, [x27], x28
+ld1 { v1.4h }, [x27], x28
+ld1 { v1.4s }, [x27], x28
+ld1 { v1.8b }, [x27], x28
+ld1 { v1.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G04
+ld1 { v1.16b }, [x27], x28
+ld1 { v1.1d, v2.1d }, [x27], #16
+ld1 { v1.2d, v2.2d }, [x27], #32
+ld1 { v1.2s, v2.2s }, [x27], #16
+ld1 { v1.4h, v2.4h }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G05
+ld1 { v1.4s, v2.4s }, [x27], #32
+ld1 { v1.8b, v2.8b }, [x27], #16
+ld1 { v1.8h, v2.8h }, [x27], #32
+ld1 { v1.16b, v2.16b }, [x27], #32
+ld1 { v1.1d, v2.1d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G06
+ld1 { v1.2d, v2.2d }, [x27], x28
+ld1 { v1.2s, v2.2s }, [x27], x28
+ld1 { v1.4h, v2.4h }, [x27], x28
+ld1 { v1.4s, v2.4s }, [x27], x28
+ld1 { v1.8b, v2.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G07
+ld1 { v1.8h, v2.8h }, [x27], x28
+ld1 { v1.16b, v2.16b }, [x27], x28
+ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G08
+ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G09
+ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G10
+ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G11
+ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G12
+ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G13
+ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+ld1 { v1.b }[0], [x27], #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G14
+ld1 { v1.b }[8], [x27], #1
+ld1 { v1.b }[0], [x27], x28
+ld1 { v1.b }[8], [x27], x28
+ld1 { v1.h }[0], [x27], #2
+ld1 { v1.h }[4], [x27], #2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G15
+ld1 { v1.h }[0], [x27], x28
+ld1 { v1.h }[4], [x27], x28
+ld1 { v1.s }[0], [x27], #4
+ld1 { v1.s }[0], [x27], x28
+ld1 { v1.d }[0], [x27], #8
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G16
+ld1 { v1.d }[0], [x27], x28
+ld1r { v1.1d }, [x27], #8
+ld1r { v1.2d }, [x27], #8
+ld1r { v1.2s }, [x27], #4
+ld1r { v1.4h }, [x27], #2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G17
+ld1r { v1.4s }, [x27], #4
+ld1r { v1.8b }, [x27], #1
+ld1r { v1.8h }, [x27], #2
+ld1r { v1.16b }, [x27], #1
+ld1r { v1.1d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G18
+ld1r { v1.2d }, [x27], x28
+ld1r { v1.2s }, [x27], x28
+ld1r { v1.4h }, [x27], x28
+ld1r { v1.4s }, [x27], x28
+ld1r { v1.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G19
+ld1r { v1.8h }, [x27], x28
+ld1r { v1.16b }, [x27], x28
+ld2 { v1.2d, v2.2d }, [x27], #32
+ld2 { v1.2s, v2.2s }, [x27], #16
+ld2 { v1.4h, v2.4h }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G20
+ld2 { v1.4s, v2.4s }, [x27], #32
+ld2 { v1.8b, v2.8b }, [x27], #16
+ld2 { v1.8h, v2.8h }, [x27], #32
+ld2 { v1.16b, v2.16b }, [x27], #32
+ld2 { v1.2d, v2.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G21
+ld2 { v1.2s, v2.2s }, [x27], x28
+ld2 { v1.4h, v2.4h }, [x27], x28
+ld2 { v1.4s, v2.4s }, [x27], x28
+ld2 { v1.8b, v2.8b }, [x27], x28
+ld2 { v1.8h, v2.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G22
+ld2 { v1.16b, v2.16b }, [x27], x28
+ld2 { v1.b, v2.b }[0], [x27], #2
+ld2 { v1.b, v2.b }[8], [x27], #2
+ld2 { v1.b, v2.b }[0], [x27], x28
+ld2 { v1.b, v2.b }[8], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G23
+ld2 { v1.h, v2.h }[0], [x27], #4
+ld2 { v1.h, v2.h }[4], [x27], #4
+ld2 { v1.h, v2.h }[0], [x27], x28
+ld2 { v1.h, v2.h }[4], [x27], x28
+ld2 { v1.s, v2.s }[0], [x27], #8
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G24
+ld2 { v1.s, v2.s }[0], [x27], x28
+ld2 { v1.d, v2.d }[0], [x27], #16
+ld2 { v1.d, v2.d }[0], [x27], x28
+ld2r { v1.1d, v2.1d }, [x27], #16
+ld2r { v1.2d, v2.2d }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G25
+ld2r { v1.2s, v2.2s }, [x27], #8
+ld2r { v1.4h, v2.4h }, [x27], #4
+ld2r { v1.4s, v2.4s }, [x27], #8
+ld2r { v1.8b, v2.8b }, [x27], #2
+ld2r { v1.8h, v2.8h }, [x27], #4
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G26
+ld2r { v1.16b, v2.16b }, [x27], #2
+ld2r { v1.1d, v2.1d }, [x27], x28
+ld2r { v1.2d, v2.2d }, [x27], x28
+ld2r { v1.2s, v2.2s }, [x27], x28
+ld2r { v1.4h, v2.4h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G27
+ld2r { v1.4s, v2.4s }, [x27], x28
+ld2r { v1.8b, v2.8b }, [x27], x28
+ld2r { v1.8h, v2.8h }, [x27], x28
+ld2r { v1.16b, v2.16b }, [x27], x28
+ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G28
+ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G29
+ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G30
+ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G31
+ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
+ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G32
+ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
+ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G33
+ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G34
+ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G35
+ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G36
+ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G37
+ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G38
+ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G39
+ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G40
+ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G41
+ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G42
+ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G43
+ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G44
+ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+ldp s1, s2, [x27], #248
+ldp d1, d2, [x27], #496
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G45
+ldp q1, q2, [x27], #992
+ldp s1, s2, [x27, #248]!
+ldp d1, d2, [x27, #496]!
+ldp q1, q2, [x27, #992]!
+ldp w1, w2, [x27], #248
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G46
+ldp x1, x2, [x27], #496
+ldp w1, w2, [x27, #248]!
+ldp x1, x2, [x27, #496]!
+ldpsw x1, x2, [x27], #248
+ldpsw x1, x2, [x27, #248]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G47
+ldr b1, [x27], #254
+ldr h1, [x27], #254
+ldr s1, [x27], #254
+ldr d1, [x27], #254
+ldr q1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G48
+ldr b1, [x27, #254]!
+ldr h1, [x27, #254]!
+ldr s1, [x27, #254]!
+ldr d1, [x27, #254]!
+ldr q1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G49
+ldr w1, [x27], #254
+ldr x1, [x27], #254
+ldr w1, [x27, #254]!
+ldr x1, [x27, #254]!
+ldrb w1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G50
+ldrb w1, [x27, #254]!
+ldrh w1, [x27], #254
+ldrh w1, [x27, #254]!
+ldrsb w1, [x27], #254
+ldrsb x1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G51
+ldrsb w1, [x27, #254]!
+ldrsb x1, [x27, #254]!
+ldrsh w1, [x27], #254
+ldrsh x1, [x27], #254
+ldrsh w1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G52
+ldrsh x1, [x27, #254]!
+ldrsw x1, [x27], #254
+ldrsw x1, [x27, #254]!
+st1 { v1.1d }, [x27], #8
+st1 { v1.2d }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G53
+st1 { v1.2s }, [x27], #8
+st1 { v1.4h }, [x27], #8
+st1 { v1.4s }, [x27], #16
+st1 { v1.8b }, [x27], #8
+st1 { v1.8h }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G54
+st1 { v1.16b }, [x27], #16
+st1 { v1.1d }, [x27], x28
+st1 { v1.2d }, [x27], x28
+st1 { v1.2s }, [x27], x28
+st1 { v1.4h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G55
+st1 { v1.4s }, [x27], x28
+st1 { v1.8b }, [x27], x28
+st1 { v1.8h }, [x27], x28
+st1 { v1.16b }, [x27], x28
+st1 { v1.1d, v2.1d }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G56
+st1 { v1.2d, v2.2d }, [x27], #32
+st1 { v1.2s, v2.2s }, [x27], #16
+st1 { v1.4h, v2.4h }, [x27], #16
+st1 { v1.4s, v2.4s }, [x27], #32
+st1 { v1.8b, v2.8b }, [x27], #16
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G57
+st1 { v1.8h, v2.8h }, [x27], #32
+st1 { v1.16b, v2.16b }, [x27], #32
+st1 { v1.1d, v2.1d }, [x27], x28
+st1 { v1.2d, v2.2d }, [x27], x28
+st1 { v1.2s, v2.2s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G58
+st1 { v1.4h, v2.4h }, [x27], x28
+st1 { v1.4s, v2.4s }, [x27], x28
+st1 { v1.8b, v2.8b }, [x27], x28
+st1 { v1.8h, v2.8h }, [x27], x28
+st1 { v1.16b, v2.16b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G59
+st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G60
+st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G61
+st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G62
+st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G63
+st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G64
+st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G65
+st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+st1 { v1.b }[0], [x27], #1
+st1 { v1.b }[8], [x27], #1
+st1 { v1.b }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G66
+st1 { v1.b }[8], [x27], x28
+st1 { v1.h }[0], [x27], #2
+st1 { v1.h }[4], [x27], #2
+st1 { v1.h }[0], [x27], x28
+st1 { v1.h }[4], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G67
+st1 { v1.s }[0], [x27], #4
+st1 { v1.s }[0], [x27], x28
+st1 { v1.d }[0], [x27], #8
+st1 { v1.d }[0], [x27], x28
+st2 { v1.2d, v2.2d }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G68
+st2 { v1.2s, v2.2s }, [x27], #16
+st2 { v1.4h, v2.4h }, [x27], #16
+st2 { v1.4s, v2.4s }, [x27], #32
+st2 { v1.8b, v2.8b }, [x27], #16
+st2 { v1.8h, v2.8h }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G69
+st2 { v1.16b, v2.16b }, [x27], #32
+st2 { v1.2d, v2.2d }, [x27], x28
+st2 { v1.2s, v2.2s }, [x27], x28
+st2 { v1.4h, v2.4h }, [x27], x28
+st2 { v1.4s, v2.4s }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G70
+st2 { v1.8b, v2.8b }, [x27], x28
+st2 { v1.8h, v2.8h }, [x27], x28
+st2 { v1.16b, v2.16b }, [x27], x28
+st2 { v1.b, v2.b }[0], [x27], #2
+st2 { v1.b, v2.b }[8], [x27], #2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G71
+st2 { v1.b, v2.b }[0], [x27], x28
+st2 { v1.b, v2.b }[8], [x27], x28
+st2 { v1.h, v2.h }[0], [x27], #4
+st2 { v1.h, v2.h }[4], [x27], #4
+st2 { v1.h, v2.h }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G72
+st2 { v1.h, v2.h }[4], [x27], x28
+st2 { v1.s, v2.s }[0], [x27], #8
+st2 { v1.s, v2.s }[0], [x27], x28
+st2 { v1.d, v2.d }[0], [x27], #16
+st2 { v1.d, v2.d }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G73
+st2g x26, [x27], #4064
+st2g x26, [x27, #4064]!
+st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G74
+st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G75
+st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G76
+st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G77
+st3 { v1.h, v2.h, v3.h }[0], [x27], #6
+st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G78
+st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G79
+st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G80
+st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G81
+st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G82
+st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G83
+st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+stg x26, [x27], #4064
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G84
+stg x26, [x27, #4064]!
+stgp x1, x2, [x27], #992
+stgp x1, x2, [x27, #992]!
+stp s1, s2, [x27], #248
+stp d1, d2, [x27], #496
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G85
+stp q1, q2, [x27], #992
+stp s1, s2, [x27, #248]!
+stp d1, d2, [x27, #496]!
+stp q1, q2, [x27, #992]!
+stp w1, w2, [x27], #248
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G86
+stp x1, x2, [x27], #496
+stp w1, w2, [x27, #248]!
+stp x1, x2, [x27, #496]!
+str b1, [x27], #254
+str h1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G87
+str s1, [x27], #254
+str d1, [x27], #254
+str q1, [x27], #254
+str b1, [x27, #254]!
+str h1, [x27, #254]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G88
+str s1, [x27, #254]!
+str d1, [x27, #254]!
+str q1, [x27, #254]!
+str w1, [x27], #254
+str x1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G89
+str w1, [x27, #254]!
+str x1, [x27, #254]!
+strb w1, [x27], #254
+strb w1, [x27, #254]!
+strh w1, [x27], #254
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G90
+strh w1, [x27, #254]!
+stz2g x26, [x27], #4064
+stz2g x26, [x27, #4064]!
+stzg x26, [x27], #4064
+stzg x26, [x27, #4064]!
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN G91
+ldr x1, [x27], #254
+ldr x2, [x1], #254
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - G01
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.97
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4s }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [1] Code Region - G02
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.97
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.2d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [2] Code Region - G03
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.97
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [3] Code Region - G04
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1400
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.76
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [4] Code Region - G05
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [5] Code Region - G06
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 3.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [6] Code Region - G07
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1800
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.54
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [7] Code Region - G08
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.94
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [8] Code Region - G09
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.94
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [9] Code Region - G10
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 608
+# CHECK-NEXT: Total uOps: 2200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.62
+# CHECK-NEXT: IPC: 0.82
+# CHECK-NEXT: Block RThroughput: 5.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,4] .D===eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+
+# CHECK: [10] Code Region - G11
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 675
+# CHECK-NEXT: Total uOps: 2500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.70
+# CHECK-NEXT: IPC: 0.74
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 1 3.0 0.4 0.0 <total>
+
+# CHECK: [11] Code Region - G12
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 675
+# CHECK-NEXT: Total uOps: 2500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.70
+# CHECK-NEXT: IPC: 0.74
+# CHECK-NEXT: Block RThroughput: 6.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.4 0.0 <total>
+
+# CHECK: [12] Code Region - G13
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1210
+# CHECK-NEXT: Total uOps: 2300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.90
+# CHECK-NEXT: IPC: 0.41
+# CHECK-NEXT: Block RThroughput: 5.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 01
+
+# CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D===eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,4] .D==========eeeeeeeeER ld1 { v1.b }[0], [x27], #1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 1 4.2 0.4 0.0 <total>
+
+# CHECK: [13] Code Region - G14
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 1 17.0 0.2 0.0 <total>
+
+# CHECK: [14] Code Region - G15
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.37
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 1 17.0 0.2 0.0 <total>
+
+# CHECK: [15] Code Region - G16
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1203
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.25
+# CHECK-NEXT: IPC: 0.42
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.4h }, [x27], #2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [16] Code Region - G17
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.94
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.1d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [17] Code Region - G18
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.94
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.8b }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [18] Code Region - G19
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 1900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.73
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 2.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [19] Code Region - G20
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 2400
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.71
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+
+# CHECK: [20] Code Region - G21
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 2200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.31
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+
+# CHECK: [21] Code Region - G22
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 3310
+# CHECK-NEXT: Total uOps: 2100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.63
+# CHECK-NEXT: IPC: 0.15
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 1 16.6 0.2 0.0 <total>
+
+# CHECK: [22] Code Region - G23
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 1 16.8 0.2 0.0 <total>
+
+# CHECK: [23] Code Region - G24
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 2603
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.77
+# CHECK-NEXT: IPC: 0.19
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,2] D================eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] D=================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,4] .D=================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 18.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 4. 1 18.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 1 12.6 0.2 0.0 <total>
+
+# CHECK: [24] Code Region - G25
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.92
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [25] Code Region - G26
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.92
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [26] Code Region - G27
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 2300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 2.8
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [27] Code Region - G28
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 3200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.27
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+
+# CHECK: [28] Code Region - G29
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 3300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.47
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+
+# CHECK: [29] Code Region - G30
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1910
+# CHECK-NEXT: Total uOps: 3200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.68
+# CHECK-NEXT: IPC: 0.26
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,4] . D================eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 10.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 1 6.4 0.2 0.0 <total>
+
+# CHECK: [30] Code Region - G31
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 3000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.75
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+
+# CHECK: [31] Code Region - G32
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 3000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 0.75
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 3.8
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+
+# CHECK: [32] Code Region - G33
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 3200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.27
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+
+# CHECK: [33] Code Region - G34
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 3300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.47
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+
+# CHECK: [34] Code Region - G35
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total uOps: 3200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.27
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+
+# CHECK: [35] Code Region - G36
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 710
+# CHECK-NEXT: Total uOps: 4500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.34
+# CHECK-NEXT: IPC: 0.70
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,3] . D=eeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,4] . D==eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1 1.6 0.4 0.0 <total>
+
+# CHECK: [36] Code Region - G37
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 810
+# CHECK-NEXT: Total uOps: 4900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.05
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,3] . DeeeeeeeeeER. . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,4] . D===eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 3.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 1 1.6 0.8 0.0 <total>
+
+# CHECK: [37] Code Region - G38
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 809
+# CHECK-NEXT: Total uOps: 4900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.06
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeeER.. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
+
+# CHECK: [38] Code Region - G39
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 4000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.00
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+
+# CHECK: [39] Code Region - G40
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 4003
+# CHECK-NEXT: Total uOps: 4000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.00
+# CHECK-NEXT: IPC: 0.12
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 012
+
+# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+
+# CHECK: [40] Code Region - G41
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1903
+# CHECK-NEXT: Total uOps: 4100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.15
+# CHECK-NEXT: IPC: 0.26
+# CHECK-NEXT: Block RThroughput: 5.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 01
+
+# CHECK: [0,0] DeeeeeeeeER . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1] D========eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,2] .D========eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3] . D========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,4] . D========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3. 1 9.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 1 7.4 0.2 0.0 <total>
+
+# CHECK: [41] Code Region - G42
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 659
+# CHECK-NEXT: Total uOps: 4300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.53
+# CHECK-NEXT: IPC: 0.76
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,3] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
+
+# CHECK: [42] Code Region - G43
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 610
+# CHECK-NEXT: Total uOps: 4200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.89
+# CHECK-NEXT: IPC: 0.82
+# CHECK-NEXT: Block RThroughput: 5.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1 1.6 0.4 0.0 <total>
+
+# CHECK: [43] Code Region - G44
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 3400
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 6.69
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeeeE-R ldp s1, s2, [x27], #248
+# CHECK-NEXT: [0,4] . D=eeeeeeER ldp d1, d2, [x27], #496
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 1.0 ldp s1, s2, [x27], #248
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27], #496
+# CHECK-NEXT: 1 1.4 0.2 0.2 <total>
+
+# CHECK: [44] Code Region - G45
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 507
+# CHECK-NEXT: Total uOps: 2300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.54
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.3
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992
+# CHECK-NEXT: [0,1] D=eeeeeeER.. ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2] D==eeeeeeER. ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3] .D==eeeeeeER ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4] .D===eeeeE-R ldp w1, w2, [x27], #248
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: 4. 1 4.0 0.0 1.0 ldp w1, w2, [x27], #248
+# CHECK-NEXT: 1 2.6 0.2 0.2 <total>
+
+# CHECK: [45] Code Region - G46
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 507
+# CHECK-NEXT: Total uOps: 2100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.14
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER .. ldp x1, x2, [x27], #496
+# CHECK-NEXT: [0,1] D=eeeeER .. ldp w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,2] D==eeeeER .. ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3] D===eeeeeER. ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: [0,4] .D===eeeeeER ldpsw x1, x2, [x27, #248]!
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [46] Code Region - G47
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27], #254
+# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27], #254
+# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27], #254
+# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27], #254
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27], #254
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27], #254
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27], #254
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [47] Code Region - G48
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.95
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27, #254]!
+# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27, #254]!
+# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27, #254]!
+# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27, #254]!
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27, #254]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27, #254]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27, #254]!
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27, #254]!
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [48] Code Region - G49
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total uOps: 1000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . ldr w1, [x27], #254
+# CHECK-NEXT: [0,1] D=eeeeER . ldr x1, [x27], #254
+# CHECK-NEXT: [0,2] D==eeeeER . ldr w1, [x27, #254]!
+# CHECK-NEXT: [0,3] D===eeeeER. ldr x1, [x27, #254]!
+# CHECK-NEXT: [0,4] D====eeeeER ldrb w1, [x27], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr x1, [x27], #254
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr w1, [x27, #254]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr x1, [x27, #254]!
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrb w1, [x27], #254
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [49] Code Region - G50
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total uOps: 1000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eeeeER . ldrh w1, [x27], #254
+# CHECK-NEXT: [0,2] D==eeeeER . ldrh w1, [x27, #254]!
+# CHECK-NEXT: [0,3] D===eeeeER. ldrsb w1, [x27], #254
+# CHECK-NEXT: [0,4] D====eeeeER ldrsb x1, [x27], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrh w1, [x27], #254
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsb w1, [x27], #254
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsb x1, [x27], #254
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [50] Code Region - G51
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total uOps: 1000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 1.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 1.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eeeeER . ldrsb x1, [x27, #254]!
+# CHECK-NEXT: [0,2] D==eeeeER . ldrsh w1, [x27], #254
+# CHECK-NEXT: [0,3] D===eeeeER. ldrsh x1, [x27], #254
+# CHECK-NEXT: [0,4] D====eeeeER ldrsh w1, [x27, #254]!
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsh x1, [x27], #254
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsh w1, [x27, #254]!
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [51] Code Region - G52
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1200
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.38
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 1.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eeeeER. ldrsw x1, [x27], #254
+# CHECK-NEXT: [0,2] D==eeeeER ldrsw x1, [x27, #254]!
+# CHECK-NEXT: [0,3] D===eeE-R st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,4] D====eeER st1 { v1.2d }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]!
+# CHECK-NEXT: 3. 1 4.0 0.0 1.0 st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 1 3.0 0.2 0.2 <total>
+
+# CHECK: [52] Code Region - G53
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,4] D====eeER st1 { v1.8h }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [53] Code Region - G54
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeER . st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeER. st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,4] D====eeER st1 { v1.4h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [54] Code Region - G55
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeER. st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,4] D====eeER st1 { v1.1d, v2.1d }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [55] Code Region - G56
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.77
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 3.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3] D===eeER. st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,4] .D===eeER st1 { v1.8b, v2.8b }, [x27], #16
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [56] Code Region - G57
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 2100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.17
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,2] D==eeER . st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2s, v2.2s }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+
+# CHECK: [57] Code Region - G58
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 2100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.17
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeER st1 { v1.16b, v2.16b }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [58] Code Region - G59
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 2900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.13
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,4] . D===eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
+
+# CHECK: [59] Code Region - G60
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 3100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.41
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4] . D===eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
+
+# CHECK: [60] Code Region - G61
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 2900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.13
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 2.4 0.4 0.0 <total>
+
+# CHECK: [61] Code Region - G62
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 704
+# CHECK-NEXT: Total uOps: 3100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.40
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 6.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3] .D===eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,4] . D====eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1 3.0 0.6 0.0 <total>
+
+# CHECK: [62] Code Region - G63
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 804
+# CHECK-NEXT: Total uOps: 3700
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.60
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1] D=eeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,2] .D==eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3] . D==eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,4] . D=====eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1 3.0 0.8 0.0 <total>
+
+# CHECK: [63] Code Region - G64
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 3300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.69
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1 2.8 0.4 0.0 <total>
+
+# CHECK: [64] Code Region - G65
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 706
+# CHECK-NEXT: Total uOps: 3000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 4.25
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2] .D===eeeeER . st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,3] . D===eeeeER. st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,4] . D====eeeeER st1 { v1.b }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 1 3.0 0.6 0.0 <total>
+
+# CHECK: [65] Code Region - G66
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.95
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeER st1 { v1.h }[4], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [66] Code Region - G67
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 605
+# CHECK-NEXT: Total uOps: 2300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.80
+# CHECK-NEXT: IPC: 0.83
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . st1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.2d, v2.2d }, [x27], #32
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [67] Code Region - G68
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 705
+# CHECK-NEXT: Total uOps: 2600
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.69
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER .. st2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1] D=eeeeER .. st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] D==eeeeER .. st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,3] .D==eeeeER.. st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,4] .D====eeeeER st2 { v1.8h, v2.8h }, [x27], #32
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1 2.8 0.4 0.0 <total>
+
+# CHECK: [68] Code Region - G69
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 805
+# CHECK-NEXT: Total uOps: 2900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.60
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,2] .D===eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.4s, v2.4s }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1 3.6 0.6 0.0 <total>
+
+# CHECK: [69] Code Region - G70
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 706
+# CHECK-NEXT: Total uOps: 2600
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.68
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeER . . st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.b, v2.b }[8], [x27], #2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 5.0 2.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 1 3.2 0.6 0.0 <total>
+
+# CHECK: [70] Code Region - G71
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.95
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.h, v2.h }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [71] Code Region - G72
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.95
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.d, v2.d }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [72] Code Region - G73
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 807
+# CHECK-NEXT: Total uOps: 3000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.72
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064
+# CHECK-NEXT: [0,1] D=eER. . . st2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,2] D==eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3] .D==eeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,4] .D======eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2g x26, [x27], #4064
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1 3.2 0.8 0.0 <total>
+
+# CHECK: [73] Code Region - G74
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1405
+# CHECK-NEXT: Total uOps: 4700
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.35
+# CHECK-NEXT: IPC: 0.36
+# CHECK-NEXT: Block RThroughput: 14.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeeeeER . . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,2] . D===eeeeeeER . . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3] . D===eeeeeeER. . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,4] . D=======eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 2. 1 4.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 4. 1 8.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 3.6 1.6 0.0 <total>
+
+# CHECK: [74] Code Region - G75
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1206
+# CHECK-NEXT: Total uOps: 4100
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.40
+# CHECK-NEXT: IPC: 0.41
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeER . . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] .D===eeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . D===eeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . D======eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 3.6 1.2 0.0 <total>
+
+# CHECK: [75] Code Region - G76
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1106
+# CHECK-NEXT: Total uOps: 3800
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.44
+# CHECK-NEXT: IPC: 0.45
+# CHECK-NEXT: Block RThroughput: 11.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeER . .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,2] .D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,3] . D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D=======eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 2. 1 5.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 4. 1 8.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 1 4.0 1.2 0.0 <total>
+
+# CHECK: [76] Code Region - G77
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1005
+# CHECK-NEXT: Total uOps: 3500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.48
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 10.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1] D=eeeeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,2] .D===eeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D====eeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,4] . D======eeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 4. 1 7.0 2.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 1 3.8 1.0 0.0 <total>
+
+# CHECK: [77] Code Region - G78
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1304
+# CHECK-NEXT: Total uOps: 4300
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.30
+# CHECK-NEXT: IPC: 0.38
+# CHECK-NEXT: Block RThroughput: 13.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeER . .. st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeER . .. st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,2] .D===eeeeeER .. st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D===eeeeeER .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,4] . D=====eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1 3.4 1.0 0.0 <total>
+
+# CHECK: [78] Code Region - G79
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 2399
+# CHECK-NEXT: Total uOps: 6900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.88
+# CHECK-NEXT: IPC: 0.21
+# CHECK-NEXT: Block RThroughput: 24.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012
+
+# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,2] . D====eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3] . D=========eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,4] . D========eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 2. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3. 1 10.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1 5.2 2.0 0.0 <total>
+
+# CHECK: [79] Code Region - G80
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1903
+# CHECK-NEXT: Total uOps: 5700
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.00
+# CHECK-NEXT: IPC: 0.26
+# CHECK-NEXT: Block RThroughput: 19.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 01
+
+# CHECK: [0,0] DeeeeeER . . .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeER. . .. st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2] . D=====eeeeeeER .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . D=====eeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . D=========eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 6.0 5.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 6.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 10.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1 4.8 2.0 0.0 <total>
+
+# CHECK: [80] Code Region - G81
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 1658
+# CHECK-NEXT: Total uOps: 4900
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.96
+# CHECK-NEXT: IPC: 0.30
+# CHECK-NEXT: Block RThroughput: 16.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 0123
+
+# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2] . D=========eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,3] . D===========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,4] . D============eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2. 1 10.0 9.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 3. 1 12.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 4. 1 13.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 1 7.4 2.4 0.0 <total>
+
+# CHECK: [81] Code Region - G82
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 757
+# CHECK-NEXT: Total uOps: 2500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.30
+# CHECK-NEXT: IPC: 0.66
+# CHECK-NEXT: Block RThroughput: 7.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 01234
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] D=eeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,2] D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,3] .D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,4] .D=====eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 2. 1 4.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 4. 1 6.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 1 3.4 0.6 0.0 <total>
+
+# CHECK: [82] Code Region - G83
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 704
+# CHECK-NEXT: Total uOps: 2700
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.84
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 7.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1] D=eeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,2] .D==eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,3] .D===eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D===eE--R stg x26, [x27], #4064
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 2.0 stg x26, [x27], #4064
+# CHECK-NEXT: 1 2.8 0.4 0.4 <total>
+
+# CHECK: [83] Code Region - G84
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1700
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.37
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeER . . stg x26, [x27, #4064]!
+# CHECK-NEXT: [0,1] D=eER. . stgp x1, x2, [x27], #992
+# CHECK-NEXT: [0,2] D==eER . stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,3] D===eeER. stp s1, s2, [x27], #248
+# CHECK-NEXT: [0,4] .D===eeER stp d1, d2, [x27], #496
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stg x26, [x27, #4064]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stgp x1, x2, [x27], #992
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp s1, s2, [x27], #248
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 stp d1, d2, [x27], #496
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [84] Code Region - G85
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.84
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992
+# CHECK-NEXT: [0,1] D==eeER . stp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2] D===eeER . stp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3] .D===eeER. stp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4] .D=====eER stp w1, w2, [x27], #248
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992
+# CHECK-NEXT: 1. 1 3.0 0.0 0.0 stp s1, s2, [x27, #248]!
+# CHECK-NEXT: 2. 1 4.0 0.0 0.0 stp d1, d2, [x27, #496]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp q1, q2, [x27, #992]!
+# CHECK-NEXT: 4. 1 6.0 0.0 0.0 stp w1, w2, [x27], #248
+# CHECK-NEXT: 1 3.6 0.2 0.0 <total>
+
+# CHECK: [85] Code Region - G86
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 1700
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.37
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeER . . stp x1, x2, [x27], #496
+# CHECK-NEXT: [0,1] D=eER. . stp w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,2] D==eER . stp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3] D===eeER. str b1, [x27], #254
+# CHECK-NEXT: [0,4] .D===eeER str h1, [x27], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stp x1, x2, [x27, #496]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27], #254
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27], #254
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [86] Code Region - G87
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total uOps: 2000
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 012345678
+
+# CHECK: [0,0] DeeER. . str s1, [x27], #254
+# CHECK-NEXT: [0,1] D=eeER . str d1, [x27], #254
+# CHECK-NEXT: [0,2] D==eeER . str q1, [x27], #254
+# CHECK-NEXT: [0,3] D===eeER. str b1, [x27, #254]!
+# CHECK-NEXT: [0,4] .D===eeER str h1, [x27, #254]!
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27], #254
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27], #254
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27, #254]!
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27, #254]!
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [87] Code Region - G88
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 503
+# CHECK-NEXT: Total uOps: 1800
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.58
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 01234567
+
+# CHECK: [0,0] DeeER. . str s1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eeER . str d1, [x27, #254]!
+# CHECK-NEXT: [0,2] D==eeER. str q1, [x27, #254]!
+# CHECK-NEXT: [0,3] D===eER. str w1, [x27], #254
+# CHECK-NEXT: [0,4] .D===eER str x1, [x27], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27, #254]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27, #254]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str w1, [x27], #254
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str x1, [x27], #254
+# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+
+# CHECK: [88] Code Region - G89
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 503
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 01234567
+
+# CHECK: [0,0] DeER . . str w1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eER. . str x1, [x27, #254]!
+# CHECK-NEXT: [0,2] D==eER . strb w1, [x27], #254
+# CHECK-NEXT: [0,3] D===eER. strb w1, [x27, #254]!
+# CHECK-NEXT: [0,4] D====eER strh w1, [x27], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str x1, [x27, #254]!
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 strb w1, [x27], #254
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 strb w1, [x27, #254]!
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 strh w1, [x27], #254
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [89] Code Region - G90
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 500
+# CHECK-NEXT: Total Cycles: 503
+# CHECK-NEXT: Total uOps: 1500
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 01234567
+
+# CHECK: [0,0] DeER . . strh w1, [x27, #254]!
+# CHECK-NEXT: [0,1] D=eER. . stz2g x26, [x27], #4064
+# CHECK-NEXT: [0,2] D==eER . stz2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,3] D===eER. stzg x26, [x27], #4064
+# CHECK-NEXT: [0,4] D====eER stzg x26, [x27, #4064]!
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]!
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stz2g x26, [x27], #4064
+# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stz2g x26, [x27, #4064]!
+# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stzg x26, [x27], #4064
+# CHECK-NEXT: 4. 1 5.0 0.0 0.0 stzg x26, [x27, #4064]!
+# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+
+# CHECK: [90] Code Region - G91
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 200
+# CHECK-NEXT: Total Cycles: 110
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 16
+# CHECK-NEXT: uOps Per Cycle: 3.64
+# CHECK-NEXT: IPC: 1.82
+# CHECK-NEXT: Block RThroughput: 0.7
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . ldr x1, [x27], #254
+# CHECK-NEXT: [0,1] D====eeeeER ldr x2, [x1], #254
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr x1, [x27], #254
+# CHECK-NEXT: 1. 1 5.0 0.0 0.0 ldr x2, [x1], #254
+# CHECK-NEXT: 1 3.0 0.5 0.0 <total>
More information about the llvm-commits
mailing list