[llvm] [AArch64] Change IssueWidth to 5 in AArch64SchedNeoverseN2.td (PR #145717)
Simon Wallis via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 25 08:06:55 PDT 2025
https://github.com/simonwallis2 created https://github.com/llvm/llvm-project/pull/145717
It has been observed that the issue width for neoverse-n2 CPUs is set too high, and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (10, 8, 6, 5, 4) with runs of various workloads on a neoverse-n2 machine and I got the highest overall geomean score with an issue width of 5.
If this patch were to cause any major regression post-commit, it could be easily reverted, but it is likely to show an overall improvement.
Related Neoverse-V2 PR: https://github.com/llvm/llvm-project/pull/142565
Change-Id: Icdbc0aef5ea004439da6abffaaae3151f91c1b5c
>From 8aac18eac63c0c7b36fc04c88bbdc0f9044246e5 Mon Sep 17 00:00:00 2001
From: Simon Wallis <simon.wallis2 at arm.com>
Date: Wed, 25 Jun 2025 15:52:05 +0100
Subject: [PATCH] [AArch64] Change IssueWidth to 5 in AArch64SchedNeoverseN2.td
It has been observed that the issue width for neoverse-n2 CPUs is set too high,
and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (10, 8, 6, 5, 4)
with runs of various workloads on a neoverse-n2 machine
and I got the highest overall geomean score with an issue width of 5.
If this patch were to cause any major regression post-commit,
it could be easily reverted, but it is likely to show an overall improvement.
Related Neoverse-V2 PR: https://github.com/llvm/llvm-project/pull/142565
Change-Id: Icdbc0aef5ea004439da6abffaaae3151f91c1b5c
---
.../Target/AArch64/AArch64SchedNeoverseN2.td | 2 +-
llvm/test/CodeGen/AArch64/machine-combiner.ll | 8 +-
.../AArch64/Neoverse/N2-sve-instructions.s | 18 +-
.../llvm-mca/AArch64/Neoverse/N2-writeback.s | 4014 +++++++++--------
4 files changed, 2023 insertions(+), 2019 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index e23daec97bd2d..91a707910a7f3 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
def NeoverseN2Model : SchedMachineModel {
- let IssueWidth = 10; // Micro-ops dispatched at a time.
+ let IssueWidth = 5; // Micro-ops dispatched at a time.
let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 10; // Extra cycles for mispredicted branch.
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index c8df283aace0b..70a638857ce4a 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -262,8 +262,8 @@ define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-UNSAFE-LABEL: reassociate_adds_half:
; CHECK-UNSAFE: // %bb.0:
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fadd h2, h3, h2
-; CHECK-UNSAFE-NEXT: fadd h0, h2, h0
+; CHECK-UNSAFE-NEXT: fadd h1, h3, h2
+; CHECK-UNSAFE-NEXT: fadd h0, h1, h0
; CHECK-UNSAFE-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fadd half %x2, %t0
@@ -284,8 +284,8 @@ define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-UNSAFE-LABEL: reassociate_muls_half:
; CHECK-UNSAFE: // %bb.0:
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fmul h2, h3, h2
-; CHECK-UNSAFE-NEXT: fmul h0, h2, h0
+; CHECK-UNSAFE-NEXT: fmul h1, h3, h2
+; CHECK-UNSAFE-NEXT: fmul h0, h1, h0
; CHECK-UNSAFE-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fmul half %x2, %t0
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index 99e39567b1ad6..ef9d4463ebe52 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -5066,19 +5066,19 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 2 2 1.00 movs p0.b, p0/z, p0.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15/z, p15.b
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL1
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL12
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL2
-# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL3
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL1
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL12
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL2
+# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL3
# CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b
# CHECK-NEXT: 2 5 2.00 msb z0.d, p7/m, z1.d, z31.d
# CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h
# CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL1, x3
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL12, x3
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL2, x3
-# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL3, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL1, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL12, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL2, x3
+# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL3, x3
# CHECK-NEXT: 1 4 1.00 mul z0.b, p7/m, z0.b, z31.b
# CHECK-NEXT: 1 4 1.00 mul z0.b, z1.b, z2.b
# CHECK-NEXT: 2 5 2.00 mul z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
index 5ffaf9138d482..dee46a304582b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -1185,10 +1185,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1197,13 +1197,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1215,14 +1215,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
# CHECK: [1] Code Region - G02
@@ -1231,10 +1231,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1243,13 +1243,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1261,14 +1261,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
# CHECK: [2] Code Region - G03
@@ -1277,10 +1277,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1289,13 +1289,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1307,14 +1307,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
# CHECK: [3] Code Region - G04
@@ -1323,10 +1323,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.74
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1334,14 +1334,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1352,15 +1352,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.0 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [4] Code Region - G05
@@ -1369,10 +1369,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.3
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1380,14 +1380,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1398,15 +1398,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [5] Code Region - G06
@@ -1415,10 +1415,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.3
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1426,14 +1426,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1444,15 +1444,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [6] Code Region - G07
@@ -1461,10 +1461,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.53
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK-NEXT: Block RThroughput: 4.6
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1472,14 +1472,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1490,15 +1490,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [7] Code Region - G08
@@ -1507,7 +1507,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.92
# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 5.0
@@ -1518,14 +1518,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1536,15 +1536,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [8] Code Region - G09
@@ -1553,7 +1553,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.92
# CHECK-NEXT: IPC: 1.97
# CHECK-NEXT: Block RThroughput: 5.0
@@ -1564,14 +1564,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1582,26 +1582,26 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [9] Code Region - G10
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 608
+# CHECK-NEXT: Total Cycles: 708
# CHECK-NEXT: Total uOps: 2700
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.44
-# CHECK-NEXT: IPC: 1.64
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.81
+# CHECK-NEXT: IPC: 1.41
# CHECK-NEXT: Block RThroughput: 5.7
# CHECK: Timeline view:
@@ -1610,14 +1610,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9] . D===eE-----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7] . DeE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9] . .DeE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1628,42 +1628,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 9. 1 4.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.6 0.1 2.2 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.3 0.2 2.2 <total>
# CHECK: [10] Code Region - G11
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 675
+# CHECK-NEXT: Total Cycles: 1008
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.44
-# CHECK-NEXT: IPC: 1.48
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1673,43 +1673,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.9 0.2 2.5 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 2.5 <total>
# CHECK: [11] Code Region - G12
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 675
+# CHECK-NEXT: Total Cycles: 1008
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.44
-# CHECK-NEXT: IPC: 1.48
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 2.98
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1] D=eE-----R. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE-----R . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5] . D=eE-----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eE-----R add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeER . . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE-----R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE-----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1719,25 +1719,25 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.9 0.2 2.5 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 2.5 <total>
# CHECK: [12] Code Region - G13
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1210
+# CHECK-NEXT: Total Cycles: 1212
# CHECK-NEXT: Total uOps: 2800
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.31
# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 5.7
@@ -1746,16 +1746,16 @@ add x0, x27, 1
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 01
-# CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE-----R. . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE-----R . .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5] . D=eE-----R . .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE-----R . .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D========eeeeeeeeER ld1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9] . D=========eE------R add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeER. . . . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE-----R. . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeER . . . ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE-----R . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeER . . . ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE-----R . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeER . . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE-----R . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D======eeeeeeeeER ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9] . . D=======eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1765,16 +1765,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 5. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 9. 1 10.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.3 0.2 2.6 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 7.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 9. 1 8.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.4 2.6 <total>
# CHECK: [13] Code Region - G14
@@ -1783,10 +1783,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.25
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
@@ -1794,14 +1794,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1
# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1812,15 +1812,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [14] Code Region - G15
@@ -1829,10 +1829,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.25
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
@@ -1840,14 +1840,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1858,15 +1858,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [15] Code Region - G16
@@ -1875,10 +1875,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 1.66
# CHECK-NEXT: IPC: 0.83
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1886,14 +1886,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld1r { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1904,15 +1904,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 3.0 <total>
# CHECK: [16] Code Region - G17
@@ -1921,10 +1921,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.92
# CHECK-NEXT: IPC: 1.96
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1932,14 +1932,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.4s }, [x27], #4
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld1r { v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1950,15 +1950,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 3.0 <total>
# CHECK: [17] Code Region - G18
@@ -1967,10 +1967,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.92
# CHECK-NEXT: IPC: 1.96
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1978,14 +1978,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.2d }, [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld1r { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld1r { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1996,15 +1996,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 3.0 <total>
# CHECK: [18] Code Region - G19
@@ -2013,10 +2013,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2400
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.71
# CHECK-NEXT: IPC: 1.96
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 4.8
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -2024,14 +2024,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.8h }, [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld1r { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5] . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7] . DeE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9] . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2042,42 +2042,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.6 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.1 3.0 <total>
# CHECK: [19] Code Region - G20
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 909
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.69
-# CHECK-NEXT: IPC: 1.96
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.19
+# CHECK-NEXT: IPC: 1.10
+# CHECK-NEXT: Block RThroughput: 5.8
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eE------R add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeeER . . ld2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . . ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER . ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7] . .DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2087,43 +2087,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.2 0.1 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.4 3.0 <total>
# CHECK: [20] Code Region - G21
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 2700
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.29
-# CHECK-NEXT: IPC: 1.96
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.81
+# CHECK-NEXT: IPC: 1.41
+# CHECK-NEXT: Block RThroughput: 5.4
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . .DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2134,15 +2134,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.6 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.2 3.0 <total>
# CHECK: [21] Code Region - G22
@@ -2151,25 +2151,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 3310
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.79
# CHECK-NEXT: IPC: 0.30
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 5.2
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D=======================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3] . D=======eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5] . D==============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D=====================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9] . D============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2179,16 +2179,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 7. 1 24.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.5 0.1 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 3. 1 8.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 5. 1 15.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 7. 1 22.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 9. 1 29.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.0 0.1 3.0 <total>
# CHECK: [22] Code Region - G23
@@ -2197,7 +2197,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.62
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 5.0
@@ -2208,14 +2208,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4
# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3] D=========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5] .D================eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D=======================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7] .D========================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9] . D===============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2226,15 +2226,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 24.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 7. 1 25.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 31.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 9. 1 32.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 16.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
# CHECK: [23] Code Region - G24
@@ -2243,7 +2243,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 2603
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.96
# CHECK-NEXT: IPC: 0.38
# CHECK-NEXT: Block RThroughput: 5.0
@@ -2254,14 +2254,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3] D=========eE------R . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D===============eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5] .D================eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7] .D=================eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9] . D=================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3] .D========eE------R . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D===============eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D==============eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7] . D===============eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==============eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9] . D===============eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2272,15 +2272,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 3. 1 10.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 16.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 5. 1 17.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 17.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 7. 1 18.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 17.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 9. 1 18.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 12.5 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 15.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7. 1 16.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 15.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9. 1 16.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 11.3 0.1 3.0 <total>
# CHECK: [24] Code Region - G25
@@ -2289,7 +2289,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.90
# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 5.0
@@ -2300,14 +2300,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2318,15 +2318,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 3.0 <total>
# CHECK: [25] Code Region - G26
@@ -2335,7 +2335,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.90
# CHECK-NEXT: IPC: 1.96
# CHECK-NEXT: Block RThroughput: 5.0
@@ -2346,14 +2346,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2364,15 +2364,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 3.0 <total>
# CHECK: [26] Code Region - G27
@@ -2381,10 +2381,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 609
# CHECK-NEXT: Total uOps: 2800
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.60
# CHECK-NEXT: IPC: 1.64
-# CHECK-NEXT: Block RThroughput: 5.5
+# CHECK-NEXT: Block RThroughput: 5.6
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -2392,14 +2392,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28
# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9] . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2410,42 +2410,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 3.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.4 0.1 3.0 <total>
# CHECK: [27] Code Region - G28
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 759
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 3700
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.87
-# CHECK-NEXT: IPC: 1.32
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.67
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK: [0,0] DeeeeeeeeER . . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2455,43 +2455,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [28] Code Region - G29
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 759
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 3800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.01
-# CHECK-NEXT: IPC: 1.32
-# CHECK-NEXT: Block RThroughput: 7.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.77
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 7.6
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2501,43 +2501,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [29] Code Region - G30
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 2010
+# CHECK-NEXT: Total Cycles: 2011
# CHECK-NEXT: Total uOps: 3700
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 1.84
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5] . D==eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7] . D=========eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9] . D================eE------R add x0, x27, #1
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R. . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .D======eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7] . . D======eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9] . . D============eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2547,16 +2547,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 7. 1 10.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 16.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 9. 1 17.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 6.3 0.2 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 13.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9. 1 13.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 4.6 0.3 3.0 <total>
# CHECK: [30] Code Region - G31
@@ -2565,7 +2565,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.87
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 7.5
@@ -2575,15 +2575,15 @@ add x0, x27, 1
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2593,16 +2593,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 13.0 0.1 3.0 <total>
# CHECK: [31] Code Region - G32
@@ -2611,7 +2611,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 0.87
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 7.5
@@ -2621,15 +2621,15 @@ add x0, x27, 1
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2639,43 +2639,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 13.0 0.1 3.0 <total>
# CHECK: [32] Code Region - G33
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 759
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 3700
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.87
-# CHECK-NEXT: IPC: 1.32
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.67
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2685,43 +2685,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [33] Code Region - G34
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 759
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 3800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.01
-# CHECK-NEXT: IPC: 1.32
-# CHECK-NEXT: Block RThroughput: 7.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.77
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 7.6
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2731,43 +2731,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [34] Code Region - G35
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 759
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 3700
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.87
-# CHECK-NEXT: IPC: 1.32
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.67
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER .. ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER .. ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5] . D==eE------R .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeeeER.. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE------R.. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE------R add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2777,43 +2777,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [35] Code Region - G36
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 959
+# CHECK-NEXT: Total Cycles: 1010
# CHECK-NEXT: Total uOps: 4600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.80
-# CHECK-NEXT: IPC: 1.04
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.55
+# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 9.5
# CHECK: Timeline view:
-# CHECK-NEXT: 012345678
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3] .D=eE-------R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER . . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5] . D==eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9] . D====eE-------R add x0, x27, #1
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9] . . DeE-------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2823,43 +2823,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 9. 1 5.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.4 3.2 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.2 <total>
# CHECK: [36] Code Region - G37
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1008
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4800
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.76
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
# CHECK-NEXT: 012345678
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3] .D=eE-------R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5] . D===eE-------R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeeeER . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7] . D===eE-------R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5] . DeE-------R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeeER. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE-------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2869,43 +2869,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 5. 1 4.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.5 3.3 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.3 <total>
# CHECK: [37] Code Region - G38
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1009
+# CHECK-NEXT: Total Cycles: 1010
# CHECK-NEXT: Total uOps: 4800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.76
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.75
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE-------R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7] . D===eE-------R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9] . D=====eE-------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE-------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE-------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE-------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2915,16 +2915,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 9. 1 6.0 0.0 7.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.5 3.3 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 7.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.3 <total>
# CHECK: [38] Code Region - G39
@@ -2933,7 +2933,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 1.12
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 10.0
@@ -2943,15 +2943,15 @@ add x0, x27, 1
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2961,16 +2961,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 13.0 0.1 3.0 <total>
# CHECK: [39] Code Region - G40
@@ -2979,7 +2979,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 1.12
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 10.0
@@ -2989,15 +2989,15 @@ add x0, x27, 1
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1] D=eE------R . . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D========eE------R . . . . . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5] . D===============eE------R . . . . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=====================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7] . D======================eE------R. . . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D=============================eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D======eE------R . . . . . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5] . D============eE------R . . . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .D==================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7] . . D==================eE------R. . . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9] . . D========================eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3007,16 +3007,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 5. 1 16.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 22.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 7. 1 23.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 29.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 9. 1 30.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 15.5 0.1 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 5. 1 13.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 7. 1 19.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 9. 1 25.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 13.0 0.1 3.0 <total>
# CHECK: [40] Code Region - G41
@@ -3025,7 +3025,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 2103
# CHECK-NEXT: Total uOps: 4600
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.19
# CHECK-NEXT: IPC: 0.48
# CHECK-NEXT: Block RThroughput: 10.0
@@ -3035,15 +3035,15 @@ add x0, x27, 1
# CHECK-NEXT: Index 0123456789 0123
# CHECK: [0,0] DeeeeeeeeER . . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1] D=eE------R . . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .D=======eeeeeeeeER . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D========eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=======eeeeeeeeER. . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5] . D========eE------R. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7] . D==========eE------R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9] . D==========eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . D======eeeeeeeeER . . ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D======eE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=====eeeeeeeeER. . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5] . D=====eE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .D======eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7] . . D======eE------R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . D=====eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9] . . D=====eE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3053,43 +3053,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 8.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 3. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 5. 1 9.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 10.0 2.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 7. 1 11.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 10.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 9. 1 11.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 7.9 0.3 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5. 1 6.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 7.0 2.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7. 1 7.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 6.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 5.4 0.3 3.0 <total>
# CHECK: [41] Code Region - G42
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1008
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4800
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.76
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
# CHECK-NEXT: 012345678
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3099,43 +3099,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.5 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [42] Code Region - G43
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1008
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4700
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.66
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
# CHECK: Timeline view:
# CHECK-NEXT: 012345678
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE------R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5] . D===eE------R. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7] . D===eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D=====eE------R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE------R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3145,43 +3145,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 2.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 6.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.5 3.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 3.0 <total>
# CHECK: [43] Code Region - G44
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 708
+# CHECK-NEXT: Total Cycles: 808
# CHECK-NEXT: Total uOps: 3900
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.51
-# CHECK-NEXT: IPC: 1.41
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.83
+# CHECK-NEXT: IPC: 1.24
+# CHECK-NEXT: Block RThroughput: 7.8
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE------R . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5] . D===eE------R add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeE-R ldp s1, s2, [x27], #248
-# CHECK-NEXT: [0,7] . D===eE-----R add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===eeeeeeER ldp d1, d2, [x27], #496
-# CHECK-NEXT: [0,9] . D====eE----R add x0, x27, #1
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE------R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeeeER. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE------R. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeER. ldp s1, s2, [x27], #248
+# CHECK-NEXT: [0,7] . .D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER ldp d1, d2, [x27], #496
+# CHECK-NEXT: [0,9] . . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3191,43 +3191,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 4.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 1.0 ldp s1, s2, [x27], #248
-# CHECK-NEXT: 7. 1 4.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldp d1, d2, [x27], #496
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.9 0.3 2.8 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 ldp s1, s2, [x27], #248
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldp d1, d2, [x27], #496
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.4 2.6 <total>
# CHECK: [44] Code Region - G45
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 507
+# CHECK-NEXT: Total Cycles: 706
# CHECK-NEXT: Total uOps: 2800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.52
-# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 1.42
+# CHECK-NEXT: Block RThroughput: 5.6
# CHECK: Timeline view:
-# CHECK-NEXT: 01
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992
-# CHECK-NEXT: [0,1] D=eE----R .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER.. ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3] .D=eE----R.. add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER. ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5] .D==eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7] . D==eE----R add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeE-R ldp w1, w2, [x27], #248
-# CHECK-NEXT: [0,9] . D==eE---R add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . ldp q1, q2, [x27], #992
+# CHECK-NEXT: [0,1] .DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER . ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER. ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7] . DeE----R add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeE-R ldp w1, w2, [x27], #248
+# CHECK-NEXT: [0,9] . .DeE---R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3237,16 +3237,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ldp s1, s2, [x27, #248]!
# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 1.0 ldp w1, w2, [x27], #248
-# CHECK-NEXT: 9. 1 3.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.2 0.1 2.0 <total>
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 1.0 ldp w1, w2, [x27], #248
+# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.2 2.0 <total>
# CHECK: [45] Code Region - G46
@@ -3255,7 +3255,7 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 1006
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 1.99
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 10.0
@@ -3266,14 +3266,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeER . . ldp x1, x2, [x27], #496
# CHECK-NEXT: [0,1] D=eE--R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . . ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3] D==eE--R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . . ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5] .D==eE--R . . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeER . ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: [0,7] .D===eE---R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D======eeeeeER ldpsw x1, x2, [x27, #248]!
-# CHECK-NEXT: [0,9] . D=======eE---R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . . ldp w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3] .D=eE--R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . . ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5] . D=eE--R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeER . ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: [0,7] . D=eE---R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . D====eeeeeER ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9] . D=====eE---R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3284,15 +3284,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: 7. 1 4.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 7.0 4.0 0.0 ldpsw x1, x2, [x27, #248]!
-# CHECK-NEXT: 9. 1 8.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.5 0.5 1.2 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp w1, w2, [x27, #248]!
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 5.0 4.0 0.0 ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: 9. 1 6.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.5 1.2 <total>
# CHECK: [46] Code Region - G47
@@ -3301,25 +3301,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27], #254
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ldr s1, [x27], #254
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27], #254
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ldr q1, [x27], #254
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ldr h1, [x27], #254
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ldr s1, [x27], #254
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ldr d1, [x27], #254
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ldr q1, [x27], #254
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3330,15 +3330,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27], #254
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr s1, [x27], #254
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27], #254
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldr q1, [x27], #254
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldr h1, [x27], #254
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldr s1, [x27], #254
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldr d1, [x27], #254
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldr q1, [x27], #254
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [47] Code Region - G48
@@ -3347,25 +3347,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeeeER. . ldr h1, [x27, #254]!
-# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ldr s1, [x27, #254]!
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeeeER. ldr d1, [x27, #254]!
-# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ldr q1, [x27, #254]!
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeeeER. . ldr h1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . ldr s1, [x27, #254]!
+# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeeER. ldr d1, [x27, #254]!
+# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeeeER ldr q1, [x27, #254]!
+# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3376,15 +3376,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr h1, [x27, #254]!
-# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr s1, [x27, #254]!
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr d1, [x27, #254]!
-# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldr q1, [x27, #254]!
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldr h1, [x27, #254]!
+# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldr s1, [x27, #254]!
+# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ldr d1, [x27, #254]!
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ldr q1, [x27, #254]!
+# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
# CHECK: [48] Code Region - G49
@@ -3393,25 +3393,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.96
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . ldr w1, [x27], #254
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeER . ldr x1, [x27], #254
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeER . ldr w1, [x27, #254]!
-# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. ldr x1, [x27, #254]!
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeER ldrb w1, [x27], #254
-# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . ldr w1, [x27, #254]!
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeER. ldr x1, [x27, #254]!
+# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER ldrb w1, [x27], #254
+# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3423,14 +3423,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr x1, [x27], #254
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr w1, [x27, #254]!
-# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldr x1, [x27, #254]!
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrb w1, [x27], #254
-# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 1.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldr w1, [x27, #254]!
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldr x1, [x27, #254]!
+# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldrb w1, [x27], #254
+# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 1.0 <total>
# CHECK: [49] Code Region - G50
@@ -3439,25 +3439,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.96
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeER . ldrh w1, [x27], #254
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeER . ldrh w1, [x27, #254]!
-# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. ldrsb w1, [x27], #254
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeER ldrsb x1, [x27], #254
-# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . ldrh w1, [x27, #254]!
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeER. ldrsb w1, [x27], #254
+# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER ldrsb x1, [x27], #254
+# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3469,14 +3469,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrh w1, [x27], #254
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]!
-# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254
-# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 1.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldrh w1, [x27, #254]!
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldrsb w1, [x27], #254
+# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldrsb x1, [x27], #254
+# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 1.0 <total>
# CHECK: [50] Code Region - G51
@@ -3485,25 +3485,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 2.96
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeER . ldrsb x1, [x27, #254]!
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeER . ldrsh w1, [x27], #254
-# CHECK-NEXT: [0,5] D===eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. ldrsh x1, [x27], #254
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeeeER ldrsh w1, [x27, #254]!
-# CHECK-NEXT: [0,9] .D====eE--R add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER . ldrsh w1, [x27], #254
+# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeeeER. ldrsh x1, [x27], #254
+# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eeeeER ldrsh w1, [x27, #254]!
+# CHECK-NEXT: [0,9] . D==eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3515,14 +3515,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]!
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254
-# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]!
-# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 1.0 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldrsh w1, [x27], #254
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ldrsh x1, [x27], #254
+# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ldrsh w1, [x27, #254]!
+# CHECK-NEXT: 9. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.1 1.0 <total>
# CHECK: [51] Code Region - G52
@@ -3531,10 +3531,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1700
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.37
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.4
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
@@ -3542,13 +3542,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeER. ldrsw x1, [x27], #254
-# CHECK-NEXT: [0,3] D==eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,4] D==eeeeER ldrsw x1, [x27, #254]!
-# CHECK-NEXT: [0,5] D===eE--R add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeE-R st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7] .D===eE-R add x0, x27, #1
-# CHECK-NEXT: [0,8] .D===eeER st1 { v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9] .D====eER add x0, x27, #1
+# CHECK-NEXT: [0,3] .D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,4] .D=eeeeER ldrsw x1, [x27, #254]!
+# CHECK-NEXT: [0,5] .D==eE--R add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeE-R st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7] . D==eE-R add x0, x27, #1
+# CHECK-NEXT: [0,8] . D=eeER st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3560,14 +3560,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]!
-# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: 7. 1 4.0 0.0 1.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.1 0.8 <total>
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldrsw x1, [x27, #254]!
+# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 1.0 st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: 7. 1 3.0 0.0 1.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.2 0.1 0.8 <total>
# CHECK: [52] Code Region - G53
@@ -3576,24 +3576,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3604,15 +3604,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [53] Code Region - G54
@@ -3621,24 +3621,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3649,15 +3649,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [54] Code Region - G55
@@ -3666,24 +3666,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3694,15 +3694,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [55] Code Region - G56
@@ -3711,24 +3711,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2400
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.76
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.5
+# CHECK-NEXT: Block RThroughput: 4.8
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7] . DeER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3738,42 +3738,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.3 0.1 0.0 <total>
# CHECK: [56] Code Region - G57
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total Cycles: 604
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.16
-# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.30
+# CHECK-NEXT: IPC: 1.66
+# CHECK-NEXT: Block RThroughput: 5.2
# CHECK: Timeline view:
-# CHECK-NEXT: Index 012345678
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3] . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7] . DeER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3783,42 +3783,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.1 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.2 0.0 <total>
# CHECK: [57] Code Region - G58
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 504
+# CHECK-NEXT: Total Cycles: 604
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.16
-# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.30
+# CHECK-NEXT: IPC: 1.66
+# CHECK-NEXT: Block RThroughput: 5.2
# CHECK: Timeline view:
-# CHECK-NEXT: Index 012345678
+# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER . st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7] . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9] . .DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3829,41 +3829,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.2 0.0 <total>
# CHECK: [58] Code Region - G59
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 3400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.84
-# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.39
+# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: Block RThroughput: 6.8
# CHECK: Timeline view:
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3873,42 +3874,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.9 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [59] Code Region - G60
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 3600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.12
-# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 6.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.59
+# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: Block RThroughput: 7.2
# CHECK: Timeline view:
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3918,42 +3920,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.9 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [60] Code Region - G61
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 3400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.84
-# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.39
+# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: Block RThroughput: 6.8
# CHECK: Timeline view:
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] . DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3963,43 +3966,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.7 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [61] Code Region - G62
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 704
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 3600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.11
-# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 6.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.59
+# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: Block RThroughput: 7.2
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7] . D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4009,43 +4012,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [62] Code Region - G63
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 804
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 4200
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.22
-# CHECK-NEXT: IPC: 1.24
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.19
+# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: Block RThroughput: 8.4
# CHECK: Timeline view:
-# CHECK-NEXT: 01
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeER. .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1] D=eER. .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3] .D=eER .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5] . D==eER .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7] . D==eER .. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9] . D====eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4055,42 +4058,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.5 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [63] Code Region - G64
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 3800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.41
-# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 7.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.79
+# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: Block RThroughput: 7.6
# CHECK: Timeline view:
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeER . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeER . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4100,16 +4104,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.1 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 0.0 <total>
# CHECK: [64] Code Region - G65
@@ -4118,25 +4122,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 706
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.53
# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 5.5
+# CHECK-NEXT: Block RThroughput: 6.4
# CHECK: Timeline view:
# CHECK-NEXT: 012
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeER . st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5] . D===eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeeeER. st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7] . D====eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D===eeeeER st1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D====eE--R add x0, x27, #1
+# CHECK-NEXT: [0,1] .DeER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . .DeeeeER st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9] . .D=eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4146,16 +4150,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 5. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: 7. 1 5.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.1 0.3 0.6 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.3 0.3 0.6 <total>
# CHECK: [65] Code Region - G66
@@ -4164,25 +4168,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4193,15 +4197,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 1.0 <total>
# CHECK: [66] Code Region - G67
@@ -4210,25 +4214,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.35
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 4.4
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st1 { v1.s }[0], [x27], #4
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9] . DeE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4239,15 +4243,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.4 0.1 1.0 <total>
# CHECK: [67] Code Region - G68
@@ -4256,25 +4260,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2400
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.74
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.5
+# CHECK-NEXT: Block RThroughput: 4.8
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.2s, v2.2s }, [x27], #16
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9] . DeE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4285,42 +4289,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.3 0.1 1.0 <total>
# CHECK: [68] Code Region - G69
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total Cycles: 705
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.14
-# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.69
+# CHECK-NEXT: IPC: 1.42
+# CHECK-NEXT: Block RThroughput: 5.2
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeER . st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER .. st2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1] .DeE--R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeER .. st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE--R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER.. st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5] . D=eE--R.. add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9] . .DeE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4330,43 +4334,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.1 1.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.2 1.0 <total>
# CHECK: [69] Code Region - G70
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 506
+# CHECK-NEXT: Total Cycles: 606
# CHECK-NEXT: Total uOps: 2400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.74
-# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.96
+# CHECK-NEXT: IPC: 1.65
+# CHECK-NEXT: Block RThroughput: 4.8
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeER . st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: 01
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER .. st2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1] D=eE--R .. add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER .. st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE--R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER.. st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE--R.. add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4377,15 +4381,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.3 0.2 1.0 <total>
# CHECK: [70] Code Region - G71
@@ -4394,25 +4398,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4423,15 +4427,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 1.0 <total>
# CHECK: [71] Code Region - G72
@@ -4440,25 +4444,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3] D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeER . st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5] .D==eE--R . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeeeER. st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7] .D===eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeER st2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9] . D===eE--R add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeeeER . st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3] .D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5] . D=eE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeER. st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7] . D=eE--R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeeeER st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9] . D=eE--R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4469,42 +4473,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 3. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 7. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9. 1 2.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 1.0 <total>
# CHECK: [72] Code Region - G73
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 607
+# CHECK-NEXT: Total Cycles: 707
# CHECK-NEXT: Total uOps: 2800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.61
-# CHECK-NEXT: IPC: 1.65
-# CHECK-NEXT: Block RThroughput: 4.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.96
+# CHECK-NEXT: IPC: 1.41
+# CHECK-NEXT: Block RThroughput: 5.6
# CHECK: Timeline view:
-# CHECK-NEXT: 012
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064
-# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eER. . . st2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,3] D==eER . . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,7] . D==eE---R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,9] . D===eE---R add x0, x27, #1
+# CHECK-NEXT: 0123
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064
+# CHECK-NEXT: [0,1] D=eER. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeER. . . st2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,3] .D=eER . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,5] . DeE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,7] . DeE---R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . .DeeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,9] . . DeE---R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4515,42 +4519,42 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2g x26, [x27], #4064
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]!
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 7. 1 3.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 9. 1 4.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.5 0.2 1.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2g x26, [x27, #4064]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 9. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.3 1.0 <total>
# CHECK: [73] Code Region - G74
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 708
+# CHECK-NEXT: Total Cycles: 1007
# CHECK-NEXT: Total uOps: 3800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.37
-# CHECK-NEXT: IPC: 1.41
-# CHECK-NEXT: Block RThroughput: 7.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.77
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 7.6
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeER. .. st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3] . DeE---R. .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeER.. st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4560,43 +4564,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 1.9 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 1.9 <total>
# CHECK: [74] Code Region - G75
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 707
+# CHECK-NEXT: Total Cycles: 1007
# CHECK-NEXT: Total uOps: 3400
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.81
-# CHECK-NEXT: IPC: 1.41
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.38
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 6.8
# CHECK: Timeline view:
-# CHECK-NEXT: 0123
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeER . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeER . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE---R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeeeeeER . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . DeeeeeER . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D=eE---R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D=eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeER . .. st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE---R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeER. .. st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE---R. .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeER .. st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE---R .. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4606,43 +4610,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 2.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.7 0.2 1.7 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 1.7 <total>
# CHECK: [75] Code Region - G76
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 757
+# CHECK-NEXT: Total Cycles: 1007
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.28
-# CHECK-NEXT: IPC: 1.32
-# CHECK-NEXT: Block RThroughput: 7.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. . st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeER.. st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4652,43 +4656,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 2.0 <total>
# CHECK: [76] Code Region - G77
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 757
+# CHECK-NEXT: Total Cycles: 1007
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.28
-# CHECK-NEXT: IPC: 1.32
-# CHECK-NEXT: Block RThroughput: 7.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. st3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER .. st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeER.. st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4698,43 +4702,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 2.0 <total>
# CHECK: [77] Code Region - G78
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 807
+# CHECK-NEXT: Total Cycles: 1007
# CHECK-NEXT: Total uOps: 4200
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.20
-# CHECK-NEXT: IPC: 1.24
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.17
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 8.4
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. . st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeER . st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeER . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7] . D==eE---R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER .. st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeER .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7] . . DeE---R .. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4744,43 +4748,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 7. 1 3.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 1.9 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 1.9 <total>
# CHECK: [78] Code Region - G79
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1205
+# CHECK-NEXT: Total Cycles: 1307
# CHECK-NEXT: Total uOps: 5800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.81
-# CHECK-NEXT: IPC: 0.83
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.44
+# CHECK-NEXT: IPC: 0.77
# CHECK-NEXT: Block RThroughput: 12.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1] D=eE----R . .. add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3] . DeE-----R .. add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeeeeeER .. st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5] . D=eE----R .. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . .D=eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9] . . D=eE-----R add x0, x27, #1
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeE----R . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3] . DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5] . .DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . . D=eeeeeeeER . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7] . . DeE-----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9] . . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4790,43 +4794,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1
# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 2.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 2.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 9. 1 2.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.8 0.4 2.3 <total>
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 2.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9. 1 1.0 1.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.1 0.8 2.1 <total>
# CHECK: [79] Code Region - G80
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1006
+# CHECK-NEXT: Total Cycles: 1107
# CHECK-NEXT: Total uOps: 4800
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.77
-# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 9.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.34
+# CHECK-NEXT: IPC: 0.90
+# CHECK-NEXT: Block RThroughput: 9.6
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eE---R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeER . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5] . D===eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D==eeeeeeeER. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE-----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: 01234567
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] .DeE---R . . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5] . DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeeER . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7] . . DeE----R . add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4836,43 +4840,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 3.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.6 0.4 2.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 3.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7. 1 1.0 1.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.6 1.9 <total>
# CHECK: [80] Code Region - G81
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 1058
+# CHECK-NEXT: Total Cycles: 1207
# CHECK-NEXT: Total uOps: 5200
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 4.91
-# CHECK-NEXT: IPC: 0.95
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.31
+# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 10.5
# CHECK: Timeline view:
-# CHECK-NEXT: 01234567
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeeER. . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] .DeE-----R. . . add x0, x27, #1
-# CHECK-NEXT: [0,2] . DeeeeeeeER . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3] . DeE-----R . . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D==eeeeeeER. . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5] . D===eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D===eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7] . D====eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . .D===eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9] . .D====eE----R add x0, x27, #1
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeeER. . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeE----R. . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeeER . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,4] . .DeeeeeeER. . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5] . . DeE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,6] . . D=eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7] . . D=eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9] . . .DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4882,43 +4886,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 5.0 add x0, x27, #1
+# CHECK-NEXT: 1. 1 1.0 1.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 1.0 0.0 5.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 7. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 4.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.9 0.6 2.2 <total>
+# CHECK-NEXT: 3. 1 1.0 1.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 2.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.2 0.7 2.0 <total>
# CHECK: [81] Code Region - G82
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 757
+# CHECK-NEXT: Total Cycles: 1007
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.28
-# CHECK-NEXT: IPC: 1.32
-# CHECK-NEXT: Block RThroughput: 7.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.97
+# CHECK-NEXT: IPC: 0.99
+# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9] . D===eE----R add x0, x27, #1
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . .. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] .DeE----R . .. add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER .. st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeeeER .. st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5] . DeE----R .. add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeeeER.. st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7] . . DeE----R.. add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9] . . DeE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4928,43 +4932,43 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 9. 1 4.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.0 0.5 2.0 <total>
# CHECK: [82] Code Region - G83
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1000
-# CHECK-NEXT: Total Cycles: 704
+# CHECK-NEXT: Total Cycles: 904
# CHECK-NEXT: Total uOps: 3600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.11
-# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 6.5
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 3.98
+# CHECK-NEXT: IPC: 1.11
+# CHECK-NEXT: Block RThroughput: 7.2
# CHECK: Timeline view:
-# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1] D=eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D=eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,4] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5] . D==eE--R. add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7] . D==eE--R add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eE-R stg x26, [x27], #4064
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: 012
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeeeER . . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1] .DeE----R . . add x0, x27, #1
+# CHECK-NEXT: [0,2] . DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] . DeE----R . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeeeER . st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5] . DeE--R . add x0, x27, #1
+# CHECK-NEXT: [0,6] . .DeeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7] . . DeE--R add x0, x27, #1
+# CHECK-NEXT: [0,8] . . DeE-R stg x26, [x27], #4064
+# CHECK-NEXT: [0,9] . . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -4974,16 +4978,16 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 5. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 2.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 1.0 1.0 stg x26, [x27], #4064
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.3 1.3 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7. 1 1.0 0.0 2.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 1.0 1.0 stg x26, [x27], #4064
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.1 0.5 1.3 <total>
# CHECK: [83] Code Region - G84
@@ -4992,24 +4996,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.37
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 4.4
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . stg x26, [x27, #4064]!
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eER. . stgp x1, x2, [x27], #992
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eER . stgp x1, x2, [x27, #992]!
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. stp s1, s2, [x27], #248
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER stp d1, d2, [x27], #496
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeER. . stgp x1, x2, [x27], #992
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeER . stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. stp s1, s2, [x27], #248
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER stp d1, d2, [x27], #496
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5020,15 +5024,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stg x26, [x27, #4064]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stgp x1, x2, [x27], #992
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stgp x1, x2, [x27, #992]!
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 stp s1, s2, [x27], #248
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stp d1, d2, [x27], #496
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stgp x1, x2, [x27], #992
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 stp s1, s2, [x27], #248
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 stp d1, d2, [x27], #496
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [84] Code Region - G85
@@ -5037,25 +5041,25 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.55
# CHECK-NEXT: IPC: 1.42
-# CHECK-NEXT: Block RThroughput: 3.3
+# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992
# CHECK-NEXT: [0,1] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,2] D==eeER . stp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3] D===eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D==eeER . stp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5] .D===eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D===eeER . stp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7] . D====eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D====eER. stp w1, w2, [x27], #248
-# CHECK-NEXT: [0,9] . D=====eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .D=eeER . stp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3] .D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . D=eeER . stp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5] . D==eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . D=eeER . stp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7] . D==eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . D==eER. stp w1, w2, [x27], #248
+# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5066,15 +5070,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992
# CHECK-NEXT: 1. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stp s1, s2, [x27, #248]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stp d1, d2, [x27, #496]!
-# CHECK-NEXT: 5. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 4.0 0.0 0.0 stp q1, q2, [x27, #992]!
-# CHECK-NEXT: 7. 1 5.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 5.0 0.0 0.0 stp w1, w2, [x27], #248
-# CHECK-NEXT: 9. 1 6.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 3.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp s1, s2, [x27, #248]!
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp d1, d2, [x27, #496]!
+# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 2.0 0.0 0.0 stp q1, q2, [x27, #992]!
+# CHECK-NEXT: 7. 1 3.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stp w1, w2, [x27], #248
+# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.6 0.1 0.0 <total>
# CHECK: [85] Code Region - G86
@@ -5083,24 +5087,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.37
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 4.4
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . stp x1, x2, [x27], #496
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eER. . stp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eER . stp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. str b1, [x27], #254
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER str h1, [x27], #254
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeER. . stp w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeER . stp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. str b1, [x27], #254
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER str h1, [x27], #254
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5111,15 +5115,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]!
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp x1, x2, [x27, #496]!
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str b1, [x27], #254
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27], #254
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stp w1, w2, [x27, #248]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stp x1, x2, [x27, #496]!
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 str b1, [x27], #254
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 str h1, [x27], #254
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [86] Code Region - G87
@@ -5128,24 +5132,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.96
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . str s1, [x27], #254
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . str d1, [x27], #254
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27], #254
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eeER. str b1, [x27, #254]!
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeER str h1, [x27, #254]!
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . str d1, [x27], #254
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . str q1, [x27], #254
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeeER. str b1, [x27, #254]!
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeeER str h1, [x27, #254]!
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5156,15 +5160,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27], #254
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27], #254
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str b1, [x27, #254]!
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str h1, [x27, #254]!
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str d1, [x27], #254
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str q1, [x27], #254
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 str b1, [x27, #254]!
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 str h1, [x27, #254]!
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [87] Code Region - G88
@@ -5173,24 +5177,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 4.56
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 3.3
+# CHECK-NEXT: Block RThroughput: 4.6
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . str s1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eeER . str d1, [x27, #254]!
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeER . str q1, [x27, #254]!
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eER . str w1, [x27], #254
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eER. str x1, [x27], #254
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeeER . str d1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeeER . str q1, [x27, #254]!
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeER . str w1, [x27], #254
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeER. str x1, [x27], #254
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5201,15 +5205,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str d1, [x27, #254]!
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str q1, [x27, #254]!
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 str w1, [x27], #254
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 str x1, [x27], #254
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str d1, [x27, #254]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str q1, [x27, #254]!
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 str w1, [x27], #254
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 str x1, [x27], #254
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [88] Code Region - G89
@@ -5218,24 +5222,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . str w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eER. . str x1, [x27, #254]!
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eER . strb w1, [x27], #254
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eER . strb w1, [x27, #254]!
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eER. strh w1, [x27], #254
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeER. . str x1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeER . strb w1, [x27], #254
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeER . strb w1, [x27, #254]!
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeER. strh w1, [x27], #254
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5246,15 +5250,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 str x1, [x27, #254]!
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 strb w1, [x27], #254
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 strb w1, [x27, #254]!
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 strh w1, [x27], #254
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str x1, [x27, #254]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 strb w1, [x27], #254
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 strb w1, [x27, #254]!
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 strh w1, [x27], #254
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [89] Code Region - G90
@@ -5263,24 +5267,24 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 10
+# CHECK: Dispatch Width: 5
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 1.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . strh w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . add x0, x27, #1
-# CHECK-NEXT: [0,2] D=eER. . stz2g x26, [x27], #4064
-# CHECK-NEXT: [0,3] D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eER . stz2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,5] .D==eER . add x0, x27, #1
-# CHECK-NEXT: [0,6] .D==eER . stzg x26, [x27], #4064
-# CHECK-NEXT: [0,7] .D===eER. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eER. stzg x26, [x27, #4064]!
-# CHECK-NEXT: [0,9] . D===eER add x0, x27, #1
+# CHECK-NEXT: [0,2] .DeER. . stz2g x26, [x27], #4064
+# CHECK-NEXT: [0,3] .D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,4] . DeER . stz2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,5] . D=eER . add x0, x27, #1
+# CHECK-NEXT: [0,6] . DeER . stzg x26, [x27], #4064
+# CHECK-NEXT: [0,7] . D=eER. add x0, x27, #1
+# CHECK-NEXT: [0,8] . DeER. stzg x26, [x27, #4064]!
+# CHECK-NEXT: [0,9] . D=eER add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5291,36 +5295,36 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stz2g x26, [x27], #4064
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stz2g x26, [x27, #4064]!
-# CHECK-NEXT: 5. 1 3.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 3.0 0.0 0.0 stzg x26, [x27], #4064
-# CHECK-NEXT: 7. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 stzg x26, [x27, #4064]!
-# CHECK-NEXT: 9. 1 4.0 0.0 0.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.7 0.1 0.0 <total>
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 stz2g x26, [x27], #4064
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 stz2g x26, [x27, #4064]!
+# CHECK-NEXT: 5. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 1.0 0.0 0.0 stzg x26, [x27], #4064
+# CHECK-NEXT: 7. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 1.0 0.0 0.0 stzg x26, [x27, #4064]!
+# CHECK-NEXT: 9. 1 2.0 0.0 0.0 add x0, x27, #1
+# CHECK-NEXT: 1 1.5 0.1 0.0 <total>
# CHECK: [90] Code Region - G91
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 110
+# CHECK-NEXT: Total Cycles: 143
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 10
-# CHECK-NEXT: uOps Per Cycle: 5.45
-# CHECK-NEXT: IPC: 3.64
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK: Dispatch Width: 5
+# CHECK-NEXT: uOps Per Cycle: 4.20
+# CHECK-NEXT: IPC: 2.80
+# CHECK-NEXT: Block RThroughput: 1.2
# CHECK: Timeline view:
# CHECK-NEXT: 0
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . ldr x1, [x27], #254
# CHECK-NEXT: [0,1] D=eE--R . add x0, x27, #1
# CHECK-NEXT: [0,2] D====eeeeER ldr x2, [x1], #254
-# CHECK-NEXT: [0,3] D=eE------R add x0, x27, #1
+# CHECK-NEXT: [0,3] .DeE------R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -5332,5 +5336,5 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr x1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 2.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 5.0 0.0 0.0 ldr x2, [x1], #254
-# CHECK-NEXT: 3. 1 2.0 0.0 6.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.5 0.3 2.0 <total>
+# CHECK-NEXT: 3. 1 1.0 0.0 6.0 add x0, x27, #1
+# CHECK-NEXT: 1 2.3 0.3 2.0 <total>
More information about the llvm-commits
mailing list