[llvm] [AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td (PR #142565)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 4 01:20:59 PDT 2025
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/142565
From 6206892d71fbe57c18ed418389ea9a30cca2febc Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 4 Jun 2025 08:09:22 +0000
Subject: [PATCH] [AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td
I think that the issue width for neoverse-v2 CPUs is set too
high and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (16, 8 and 6) with runs
of SPEC2017 on a neoverse-v2 machine and I got the highest
overall geomean score with an issue width of 6, although it's
only a marginal 0.14% improvement. I also observed a 1-2%
improvement when testing the Gromacs application with some
workloads. Here are some notable changes in SPEC2017 ref
runtimes, i.e. those with a change of ~0.5% or greater ('-'
means faster):
548.exchange2: -1.7%
510.parest: -0.78%
538.imagick: -0.73%
500.perlbench: -0.57%
525.x264: -0.55%
507.cactuBSSN: -0.5%
520.omnetpp: -0.48%
511.povray: +0.57%
544.nab: +0.65%
503.bwaves: +0.68%
526.blender: +0.75%
If this patch causes any major regressions post-commit it can
be easily reverted, but I think it should be an overall
improvement.
---
.../Target/AArch64/AArch64SchedNeoverseV2.td | 2 +-
.../AArch64/Neoverse/V2-basic-instructions.s | 10 +-
.../AArch64/Neoverse/V2-clear-upper-regs.s | 124 +-
.../llvm-mca/AArch64/Neoverse/V2-forwarding.s | 692 +++---
.../AArch64/Neoverse/V2-sve-instructions.s | 18 +-
.../llvm-mca/AArch64/Neoverse/V2-writeback.s | 1950 +++++++++--------
.../AArch64/Neoverse/V2-zero-lat-movs.s | 24 +-
7 files changed, 1411 insertions(+), 1409 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 39f7077ae4514..2fea569296427 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -14,7 +14,7 @@
//===----------------------------------------------------------------------===//
def NeoverseV2Model : SchedMachineModel {
- let IssueWidth = 16; // Micro-ops dispatched at a time.
+ let IssueWidth = 6; // Micro-ops dispatched at a time.
let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
index 581dad6b68dcf..54b5f1644be48 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
@@ -2536,14 +2536,14 @@ drps
# CHECK-NEXT: 1 2 0.50 bics x3, xzr, x3, lsl #1
# CHECK-NEXT: 1 2 0.50 tst w3, w7, lsl #31
# CHECK-NEXT: 1 2 0.50 tst x2, x20, asr #2
-# CHECK-NEXT: 1 0 0.06 mov x3, x6
-# CHECK-NEXT: 1 0 0.06 mov x3, xzr
-# CHECK-NEXT: 1 0 0.06 mov wzr, w2
-# CHECK-NEXT: 1 0 0.06 mov w3, w5
+# CHECK-NEXT: 1 0 0.17 mov x3, x6
+# CHECK-NEXT: 1 0 0.17 mov x3, xzr
+# CHECK-NEXT: 1 0 0.17 mov wzr, w2
+# CHECK-NEXT: 1 0 0.17 mov w3, w5
# CHECK-NEXT: 1 1 0.17 movz w2, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 mov w2, #-1235
# CHECK-NEXT: 1 1 0.17 mov x2, #5299989643264
-# CHECK-NEXT: 1 0 0.06 mov x2, #0
+# CHECK-NEXT: 1 0 0.17 mov x2, #0
# CHECK-NEXT: 1 1 0.17 movk w3, #0
# CHECK-NEXT: 1 1 0.17 movz x4, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 movk w5, #0, lsl #16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index fbf65e26e99a5..3398331a67f5b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [1] Code Region - FPR16-bit
@@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [2] Code Region - FPR32-bit
@@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [3] Code Region - FPR64-bit
@@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [4] Code Region - FPR128-bit
@@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-b
@@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-h
@@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [7] Code Region - SIMD64-bit-s
@@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [8] Code Region - SIMD64-bit-d
@@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [9] Code Region - insr
@@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 803
# CHECK-NEXT: Total uOps: 300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
-# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
-# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] .D===============eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] .D=====================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] .D=======================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] .D=============================eeER add z0.s, z0.s, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
-# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
-# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 4 12.5 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 18.5 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 15.5 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
index 0f5ab183f5358..39a779b27fe7f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
@@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 6.8 0.1 0.0 <total>
# CHECK: [1] Code Region - smaddl
@@ -353,7 +353,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -368,8 +368,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] .D===========eeER smaddl x0, w0, w0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -380,9 +380,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 6.8 0.1 0.0 <total>
# CHECK: [2] Code Region - fmadd
@@ -391,7 +391,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.35
# CHECK-NEXT: IPC: 0.35
# CHECK-NEXT: Block RThroughput: 1.5
@@ -406,12 +406,12 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0
# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0
# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2
-# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0
-# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0
-# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] .D================eeER . . .. fadd d0, d0, d0
+# CHECK-NEXT: [1,1] .D==================eeeeER . .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] .D======================eeeER . .. fmul d0, d0, d0
+# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] .D=============================eeeeER fmadd d0, d0, d1, d2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -420,13 +420,13 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmadd d0, d0, d1, d2
-# CHECK-NEXT: 2 15.7 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 11.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 15.2 0.1 0.0 <total>
# CHECK: [3] Code Region - saba
@@ -435,7 +435,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.5
@@ -450,8 +450,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER saba v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -462,9 +462,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [4] Code Region - sdot
@@ -473,7 +473,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 0.8
@@ -488,8 +488,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,2] .D===============eeeER . sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] .D==================eeeER sdot v0.4s, v0.16b, v1.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -500,9 +500,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: 2 10.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 sdot v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 10.5 0.1 0.0 <total>
# CHECK: [5] Code Region - smmla
@@ -511,7 +511,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 0.8
@@ -526,8 +526,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b
# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,2] D================eeeER . smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: [1,2] .D===============eeeER . smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: [1,3] .D==================eeeER smmla v0.4s, v0.16b, v1.16b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -538,9 +538,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
-# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
-# CHECK-NEXT: 2 10.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla v0.4s, v0.16b, v1.16b
+# CHECK-NEXT: 2 10.5 0.1 0.0 <total>
# CHECK: [6] Code Region - mla
@@ -549,7 +549,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
@@ -564,8 +564,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D==================eeeeER. . mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER mla v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -576,9 +576,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [7] Code Region - sqrdmlah
@@ -587,7 +587,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.29
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 3.5
@@ -602,8 +602,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeER . . . sqrdmlah v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] D==============eeeeER . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D==================eeeeER. . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] D====================eeeeER . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] D========================eeeeER sqrdmlah v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,2] .D===================eeeeER . sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] .D=======================eeeeER sqrdmlah v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -614,9 +614,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [8] Code Region - smlal2
@@ -625,7 +625,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 2.0
@@ -640,8 +640,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D==================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -652,9 +652,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [9] Code Region - sadalp
@@ -663,7 +663,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.5
@@ -678,8 +678,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s
-# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s
+# CHECK-NEXT: [1,2] .D=================eeeeER. . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,3] .D=====================eeeeER sadalp v0.2d, v0.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -690,9 +690,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [10] Code Region - ssra
@@ -701,7 +701,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.5
@@ -716,8 +716,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1
# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,2] .D=================eeeeER. . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3] .D=====================eeeeER ssra v0.2d, v0.2d, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -728,9 +728,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
# CHECK: [11] Code Region - fcmla
@@ -739,7 +739,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -754,8 +754,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -766,9 +766,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [12] Code Region - fmla
@@ -777,7 +777,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.35
# CHECK-NEXT: IPC: 0.35
# CHECK-NEXT: Block RThroughput: 1.5
@@ -792,12 +792,12 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d
# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] .D================eeeER . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D=================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] .D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4] .D=========================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] .D=============================eeeeER fmla v0.2d, v0.2d, v1.2d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -806,13 +806,13 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: 2 15.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 14.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4. 2 18.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 22.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 14.8 0.1 0.0 <total>
# CHECK: [13] Code Region - fmlal
@@ -821,7 +821,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.32
# CHECK-NEXT: IPC: 0.32
# CHECK-NEXT: Block RThroughput: 1.5
@@ -836,12 +836,12 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,4] D===========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h
# CHECK-NEXT: [0,5] D===============eeeeER . . . . fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] D======================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2] D==========================eeER . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] D============================eeeeER. . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,4] D==============================eeeeER . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5] D==================================eeeeER fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] .D==================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] .D=====================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] .D=========================eeER . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] .D===========================eeeeER. . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,4] .D=============================eeeeER . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] .D=================================eeeeER fmlal v0.4s, v0.4h, v1.4h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -850,13 +850,13 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 4. 2 21.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5. 2 25.5 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: 2 18.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 4. 2 21.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 25.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 17.5 0.1 0.0 <total>
# CHECK: [14] Code Region - bfdot
@@ -865,7 +865,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1603
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.25
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -880,8 +880,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfdot v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1] D===================eeeeeER . . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D======================eeeeeER. . bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D===========================eeeeeER bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D==========================eeeeeER bfdot v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -892,9 +892,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [15] Code Region - bfmmla
@@ -903,7 +903,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.21
# CHECK-NEXT: IPC: 0.21
# CHECK-NEXT: Block RThroughput: 1.0
@@ -918,8 +918,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=============eeeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1] D======================eeeeeeER . . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D==========================eeeeeeER. . bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D================================eeeeeeER bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D=========================eeeeeeER. . bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D===============================eeeeeeER bfmmla v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -930,9 +930,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1. 2 13.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 23.5 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [16] Code Region - bfmlalb
@@ -941,7 +941,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -956,8 +956,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb v0.4s, v0.8h, v1.8h
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] .D========================eeeeeER bfmlalb v0.4s, v0.8h, v1.8h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -968,9 +968,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [17] Code Region - crc32b
@@ -979,7 +979,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -994,8 +994,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. crc32b w0, w0, w0
# CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0
# CHECK-NEXT: [1,1] D=========eeER .. crc32b w0, w0, w1
-# CHECK-NEXT: [1,2] D==========eeER.. crc32b w0, w0, w1
-# CHECK-NEXT: [1,3] D============eeER crc32b w0, w0, w0
+# CHECK-NEXT: [1,2] .D=========eeER.. crc32b w0, w0, w1
+# CHECK-NEXT: [1,3] .D===========eeER crc32b w0, w0, w0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1006,9 +1006,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32b w0, w0, w1
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32b w0, w0, w1
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32b w0, w0, w0
-# CHECK-NEXT: 2 7.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 7.0 0.0 0.0 crc32b w0, w0, w1
+# CHECK-NEXT: 3. 2 9.0 0.0 0.0 crc32b w0, w0, w0
+# CHECK-NEXT: 2 6.8 0.1 0.0 <total>
# CHECK: [18] Code Region - Z saba
@@ -1017,7 +1017,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 1.5
@@ -1030,10 +1030,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . saba z0.d, z1.d, z2.d
# CHECK-NEXT: [0,2] D======eeeeER . . . . saba z0.d, z1.d, z2.d
# CHECK-NEXT: [0,3] D==========eeeeER . . . saba z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2] D====================eeeeER . saba z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3] D========================eeeeER saba z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D===================eeeeER . saba z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D=======================eeeeER saba z0.d, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1042,11 +1042,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 saba z0.d, z1.d, z2.d
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 saba z0.d, z1.d, z2.d
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 saba z0.d, z0.d, z1.d
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 saba z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 saba z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [19] Code Region - Z sadalp
@@ -1055,7 +1055,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 1.5
@@ -1068,10 +1068,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp z0.d, p0/m, z1.s
# CHECK-NEXT: [0,2] D======eeeeER . . . . sadalp z0.d, p0/m, z1.s
# CHECK-NEXT: [0,3] D==========eeeeER . . . sadalp z0.d, p0/m, z0.s
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,2] D====================eeeeER . sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: [1,3] D========================eeeeER sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,2] .D===================eeeeER . sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: [1,3] .D=======================eeeeER sadalp z0.d, p0/m, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1080,11 +1080,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sadalp z0.d, p0/m, z1.s
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sadalp z0.d, p0/m, z0.s
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sadalp z0.d, p0/m, z1.s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sadalp z0.d, p0/m, z0.s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [20] Code Region - Z ssra
@@ -1093,7 +1093,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 1.5
@@ -1106,10 +1106,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra z0.d, z1.d, #1
# CHECK-NEXT: [0,2] D======eeeeER . . . . ssra z0.d, z1.d, #1
# CHECK-NEXT: [0,3] D==========eeeeER . . . ssra z0.d, z0.d, #1
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . ssra z0.d, z1.d, #1
-# CHECK-NEXT: [1,2] D====================eeeeER . ssra z0.d, z1.d, #1
-# CHECK-NEXT: [1,3] D========================eeeeER ssra z0.d, z0.d, #1
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,2] .D===================eeeeER . ssra z0.d, z1.d, #1
+# CHECK-NEXT: [1,3] .D=======================eeeeER ssra z0.d, z0.d, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1118,11 +1118,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 ssra z0.d, z1.d, #1
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 ssra z0.d, z1.d, #1
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 ssra z0.d, z0.d, #1
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 ssra z0.d, z1.d, #1
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 ssra z0.d, z0.d, #1
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [21] Code Region - Z cdot.s
@@ -1131,7 +1131,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.42
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1144,10 +1144,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeER . . .. cdot z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [0,2] D======eeeER . . .. cdot z0.s, z1.b, z2.b, #90
# CHECK-NEXT: [0,3] D=========eeeER. . .. cdot z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=================eeeER .. cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2] D==================eeeER .. cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3] D=====================eeeER cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] .D=================eeeER .. cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] .D====================eeeER cdot z0.s, z0.b, z1.b, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1156,11 +1156,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 2. 2 13.0 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 cdot z0.s, z0.b, z1.b, #90
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [22] Code Region - Z cdot.d
@@ -1169,7 +1169,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1182,10 +1182,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [0,2] D======eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
# CHECK-NEXT: [0,3] D==========eeeeER . . . cdot z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,2] D====================eeeeER . cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: [1,3] D========================eeeeER cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,2] .D===================eeeeER . cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: [1,3] .D=======================eeeeER cdot z0.d, z0.h, z1.h, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1194,11 +1194,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [23] Code Region - Z cmla.b
@@ -1207,7 +1207,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1220,10 +1220,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . cmla z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [0,2] D======eeeeER . . . . cmla z0.b, z1.b, z2.b, #90
# CHECK-NEXT: [0,3] D==========eeeeER . . . cmla z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,2] D====================eeeeER . cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: [1,3] D========================eeeeER cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,2] .D===================eeeeER . cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: [1,3] .D=======================eeeeER cmla z0.b, z0.b, z1.b, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1232,11 +1232,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [24] Code Region - Z cmla.d
@@ -1245,7 +1245,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1803
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.28
# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1258,10 +1258,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [0,2] D========eeeeeER . . . . . cmla z0.d, z1.d, z2.d, #90
# CHECK-NEXT: [0,3] D=============eeeeeER . . . . cmla z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3] D===============================eeeeeER cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D==============================eeeeeER cmla z0.d, z0.d, z1.d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1270,11 +1270,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.0 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
-# CHECK-NEXT: 2 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [25] Code Region - Z sdot.s
@@ -1283,7 +1283,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.42
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1296,10 +1296,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b
# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b
# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b
-# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1308,11 +1308,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [26] Code Region - Z sudot
@@ -1321,7 +1321,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.42
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1334,10 +1334,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1]
# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b[1]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1346,11 +1346,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
-# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1]
+# CHECK-NEXT: 3. 2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b[1]
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [27] Code Region - Z sdot.d
@@ -1359,7 +1359,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1372,10 +1372,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h
# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h
# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,2] D====================eeeeER . sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,2] .D===================eeeeER . sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=======================eeeeER sdot z0.d, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1384,11 +1384,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sdot z0.d, z1.h, z2.h
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sdot z0.d, z0.h, z1.h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [28] Code Region - Z smmla
@@ -1397,7 +1397,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1103
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 0.8
@@ -1412,8 +1412,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D========eeeER . . . smmla z0.s, z0.b, z1.b
# CHECK-NEXT: [1,0] D===========eeeeER . . mul z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D===============eeeER . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,2] D================eeeER . smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: [1,3] D===================eeeER smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: [1,2] .D===============eeeER . smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D==================eeeER smmla z0.s, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1424,9 +1424,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.s, z0.s, z0.s
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla z0.s, z1.b, z2.b
-# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla z0.s, z0.b, z1.b
-# CHECK-NEXT: 2 10.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla z0.s, z1.b, z2.b
+# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla z0.s, z0.b, z1.b
+# CHECK-NEXT: 2 10.5 0.1 0.0 <total>
# CHECK: [29] Code Region - Z mla.b
@@ -1435,7 +1435,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1448,10 +1448,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [0,2] D======eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b
# CHECK-NEXT: [0,3] D==========eeeeER . . . mla z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,2] D====================eeeeER . mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: [1,3] D========================eeeeER mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=======================eeeeER mla z0.b, p0/m, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1460,11 +1460,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [30] Code Region - Z mla.d
@@ -1473,7 +1473,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1803
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.28
# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1486,10 +1486,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,2] D========eeeeeER . . . . . mla z0.d, p0/m, z1.d, z2.d
# CHECK-NEXT: [0,3] D=============eeeeeER . . . . mla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3] D===============================eeeeeER mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==============================eeeeeER mla z0.d, p0/m, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1498,11 +1498,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: 2 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [31] Code Region - Z smlalb
@@ -1511,7 +1511,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.36
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1524,10 +1524,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . smlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,2] D======eeeeER . . . . smlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,3] D==========eeeeER . . . smlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D===================eeeeER . smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2] D====================eeeeER . smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3] D========================eeeeER smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D==================eeeeER . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] .D===================eeeeER . smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] .D=======================eeeeER smlalb z0.d, z0.s, z1.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1536,11 +1536,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.0 0.0 0.0 smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 2. 2 14.0 0.0 0.0 smlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 3. 2 18.0 0.0 0.0 smlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 smlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [32] Code Region - Z sqdmlalb
@@ -1549,7 +1549,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.33
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1562,10 +1562,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s
# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqdmlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: [1,0] D===============eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,2] D======================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: [1,3] D==========================eeeeER sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: [1,3] .D=========================eeeeER sqdmlalb z0.d, z0.s, z1.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1574,11 +1574,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
-# CHECK-NEXT: 2 14.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [33] Code Region - Z sqrdmlah.b
@@ -1587,7 +1587,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.33
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 2.5
@@ -1600,10 +1600,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b
# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b
# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqrdmlah z0.b, z0.b, z1.b
-# CHECK-NEXT: [1,0] D===============eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,2] D======================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: [1,3] D==========================eeeeER sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D===================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,2] .D=====================eeeeER . . sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: [1,3] .D=========================eeeeER sqrdmlah z0.b, z0.b, z1.b
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1612,11 +1612,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: 2. 2 15.5 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
-# CHECK-NEXT: 2 14.3 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b
+# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [34] Code Region - Z sqrdmlah.d
@@ -1625,7 +1625,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1803
# CHECK-NEXT: Total uOps: 500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.28
# CHECK-NEXT: IPC: 0.22
# CHECK-NEXT: Block RThroughput: 4.0
@@ -1638,10 +1638,10 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d
# CHECK-NEXT: [0,2] D========eeeeeER . . . . . sqrdmlah z0.d, z1.d, z2.d
# CHECK-NEXT: [0,3] D=============eeeeeER . . . . sqrdmlah z0.d, z0.d, z1.d
-# CHECK-NEXT: [1,0] D==================eeeeeER . . . mul z0.d, z0.d, z0.d
-# CHECK-NEXT: [1,1] D=======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: [1,3] D===============================eeeeeER sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,2] .D=========================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D==============================eeeeeER sqrdmlah z0.d, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1650,11 +1650,11 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul z0.d, z0.d, z0.d
-# CHECK-NEXT: 1. 2 15.0 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: 2. 2 18.0 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
-# CHECK-NEXT: 3. 2 23.0 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
-# CHECK-NEXT: 2 16.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d
+# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [35] Code Region - Z fcmla ZPmZZ
@@ -1663,7 +1663,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1678,8 +1678,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: [1,3] .D========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1690,9 +1690,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [36] Code Region - Z fcmla ZZZI
@@ -1701,7 +1701,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1716,8 +1716,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: [1,3] .D========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1728,9 +1728,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
# CHECK: [37] Code Region - Z fmla ZPmZZ
@@ -1739,7 +1739,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1754,8 +1754,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, p0/m, z0.d, z1.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1766,9 +1766,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [38] Code Region - Z fmla ZZZI
@@ -1777,7 +1777,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1792,8 +1792,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1]
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, z0.d, z1.d[1]
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1804,9 +1804,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1]
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, z0.d, z1.d[1]
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [39] Code Region - Z fmlalb ZZZ
@@ -1815,7 +1815,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1303
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.31
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1830,8 +1830,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=========eeeeER . . . fmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D================eeeeER . . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D==================eeeeER. . fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D======================eeeeER fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D=================eeeeER. . fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D=====================eeeeER fmlalb z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1842,9 +1842,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 11.5 0.1 0.0 <total>
# CHECK: [40] Code Region - Z bfdot
@@ -1853,7 +1853,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1603
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.25
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1868,8 +1868,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfdot z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D===================eeeeeER . . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D======================eeeeeER. . bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D===========================eeeeeER bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D==========================eeeeeER bfdot z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1880,9 +1880,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfdot z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 20.0 0.0 0.0 bfdot z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfdot z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfdot z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 13.8 0.1 0.0 <total>
# CHECK: [41] Code Region - Z bfmmla
@@ -1891,7 +1891,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.21
# CHECK-NEXT: IPC: 0.21
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1906,8 +1906,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=============eeeeeeER . . . . bfmmla z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D===================eeeER. . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D======================eeeeeeER . . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D==========================eeeeeeER. . bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D================================eeeeeeER bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D=========================eeeeeeER. . bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D===============================eeeeeeER bfmmla z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1918,9 +1918,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 13.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 23.5 0.0 0.0 bfmmla z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 16.3 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 17.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 16.0 0.1 0.0 <total>
# CHECK: [42] Code Region - bfmlalb
@@ -1929,7 +1929,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.27
# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 1.0
@@ -1944,8 +1944,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d
# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: [1,2] .D===================eeeeeER . . bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: [1,3] .D========================eeeeeER bfmlalb z0.s, z0.h, z1.h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1956,6 +1956,6 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d
# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
-# CHECK-NEXT: 2 13.0 0.1 0.0 <total>
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h
+# CHECK-NEXT: 3. 2 18.0 0.0 0.0 bfmlalb z0.s, z0.h, z1.h
+# CHECK-NEXT: 2 12.8 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
index 6cba45cdd42be..49af4df3d8ff5 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s
@@ -5071,19 +5071,19 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 2 2 1.00 movs p0.b, p0/z, p0.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15/z, p15.b
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL1
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL12
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL2
-# CHECK-NEXT: 1 1 0.06 U mrs x3, ZCR_EL3
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL1
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL12
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL2
+# CHECK-NEXT: 1 1 0.17 U mrs x3, ZCR_EL3
# CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b
# CHECK-NEXT: 1 5 1.00 msb z0.d, p7/m, z1.d, z31.d
# CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h
# CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL1, x3
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL12, x3
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL2, x3
-# CHECK-NEXT: 1 1 0.06 U msr ZCR_EL3, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL1, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL12, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL2, x3
+# CHECK-NEXT: 1 1 0.17 U msr ZCR_EL3, x3
# CHECK-NEXT: 1 4 0.50 mul z0.b, p7/m, z0.b, z31.b
# CHECK-NEXT: 1 4 0.50 mul z0.b, z1.b, z2.b
# CHECK-NEXT: 2 5 1.00 mul z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
index 1ef746813966d..c7a93d1b4ed35 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s
@@ -733,7 +733,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.97
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 1.7
@@ -745,8 +745,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [1] Code Region - G02
@@ -769,7 +769,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.97
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 1.7
@@ -781,8 +781,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -794,9 +794,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [2] Code Region - G03
@@ -805,7 +805,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.97
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 1.7
@@ -817,8 +817,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -830,9 +830,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [3] Code Region - G04
@@ -841,7 +841,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.76
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 3.0
@@ -852,9 +852,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -865,10 +865,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [4] Code Region - G05
@@ -877,7 +877,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 3.3
@@ -888,9 +888,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.1d, v2.1d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -901,10 +901,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s }, [x27], #32
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [5] Code Region - G06
@@ -913,7 +913,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 3.3
@@ -924,9 +924,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2d, v2.2d }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeER ld1 { v1.8b, v2.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -937,10 +937,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2d, v2.2d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [6] Code Region - G07
@@ -949,7 +949,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1800
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.54
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 4.3
@@ -960,9 +960,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8h, v2.8h }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,3] . D=eeeeeeER. ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,4] . D=eeeeeeER ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -973,10 +973,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8h, v2.8h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [7] Code Region - G08
@@ -985,7 +985,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 5.0
@@ -995,10 +995,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3] . DeeeeeeER. ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,4] . DeeeeeeER ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1008,11 +1008,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [8] Code Region - G09
@@ -1021,7 +1021,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 0.98
# CHECK-NEXT: Block RThroughput: 5.0
@@ -1031,10 +1031,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeER. ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeER ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1044,11 +1044,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [9] Code Region - G10
@@ -1057,7 +1057,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 608
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.62
# CHECK-NEXT: IPC: 0.82
# CHECK-NEXT: Block RThroughput: 5.7
@@ -1067,10 +1067,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,4] .D===eeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,1] .DeeeeeeER. . ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeER . ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeER. ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,4] . DeeeeeeeER ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1080,11 +1080,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [10] Code Region - G11
@@ -1093,7 +1093,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 675
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.70
# CHECK-NEXT: IPC: 0.74
# CHECK-NEXT: Block RThroughput: 6.7
@@ -1103,10 +1103,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,1] .DeeeeeeeER . ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,2] . DeeeeeeeER . ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3] . D=eeeeeeeER. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,4] . D=eeeeeeeER ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1116,11 +1116,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 1 3.0 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 1 1.4 0.4 0.0 <total>
# CHECK: [11] Code Region - G12
@@ -1129,7 +1129,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 675
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.70
# CHECK-NEXT: IPC: 0.74
# CHECK-NEXT: Block RThroughput: 6.7
@@ -1139,10 +1139,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeER. . ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1] D=eeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,4] .D====eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeER . ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeER . ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeeeeER. ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeeeeeeER ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1152,11 +1152,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 1 1.4 0.4 0.0 <total>
# CHECK: [12] Code Region - G13
@@ -1165,7 +1165,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 1210
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.90
# CHECK-NEXT: IPC: 0.41
# CHECK-NEXT: Block RThroughput: 5.7
@@ -1175,10 +1175,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789 01
# CHECK: [0,0] DeeeeeeeER. . .. ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,4] .D==========eeeeeeeeER ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,1] .DeeeeeeeER . .. ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeER . .. ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeeeeER . .. ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,4] . D=======eeeeeeeeER ld1 { v1.b }[0], [x27], #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1188,11 +1188,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 4. 1 11.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 1 4.2 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 ld1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 8.0 0.0 0.0 ld1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
# CHECK: [13] Code Region - G14
@@ -1201,10 +1201,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
@@ -1212,9 +1212,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.b }[8], [x27], #1
# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld1 { v1.h }[4], [x27], #2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1225,10 +1225,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.b }[8], [x27], #1
# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 1 17.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld1 { v1.b }[8], [x27], x28
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
# CHECK: [14] Code Region - G15
@@ -1237,10 +1237,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
@@ -1248,9 +1248,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld1 { v1.h }[0], [x27], x28
# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,4] D================================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld1 { v1.d }[0], [x27], #8
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1261,10 +1261,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.h }[0], [x27], x28
# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 4. 1 33.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 1 17.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld1 { v1.s }[0], [x27], #4
+# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
# CHECK: [15] Code Region - G16
@@ -1273,10 +1273,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 1203
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.25
# CHECK-NEXT: IPC: 0.42
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1284,9 +1284,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1 { v1.d }[0], [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.4h }, [x27], #2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1297,10 +1297,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.d }[0], [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.1d }, [x27], #8
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.2d }, [x27], #8
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.2s }, [x27], #4
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], #2
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [16] Code Region - G17
@@ -1309,10 +1309,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.94
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1320,9 +1320,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.4s }, [x27], #4
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.1d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1333,10 +1333,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.4s }, [x27], #4
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.8b }, [x27], #1
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.8h }, [x27], #2
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.16b }, [x27], #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.1d }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [17] Code Region - G18
@@ -1345,10 +1345,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.94
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1356,9 +1356,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.2d }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeeeeeeeER ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld1r { v1.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1369,10 +1369,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.2d }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1r { v1.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld1r { v1.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1r { v1.8b }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [18] Code Region - G19
@@ -1381,10 +1381,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.73
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.0
+# CHECK-NEXT: Block RThroughput: 3.2
# CHECK: Timeline view:
# CHECK-NEXT: 01234
@@ -1392,9 +1392,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeeeER . ld1r { v1.8h }, [x27], x28
# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,3] . D=eeeeeeeeER. ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld2 { v1.4h, v2.4h }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1405,10 +1405,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1r { v1.8h }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld1r { v1.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld2 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [19] Code Region - G20
@@ -1417,20 +1417,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.71
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2 { v1.2d, v2.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1440,11 +1440,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [20] Code Region - G21
@@ -1453,20 +1453,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.31
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.7
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2 { v1.8h, v2.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1476,11 +1476,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [21] Code Region - G22
@@ -1489,20 +1489,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 3310
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.63
# CHECK-NEXT: IPC: 0.15
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld2 { v1.b, v2.b }[8], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1512,11 +1512,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 1 16.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld2 { v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [22] Code Region - G23
@@ -1525,20 +1525,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,3] D========================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,4] .D===============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld2 { v1.s, v2.s }[0], [x27], #8
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1548,11 +1548,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 3. 1 25.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 4. 1 32.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 1 16.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld2 { v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [23] Code Region - G24
@@ -1561,20 +1561,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 2603
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.77
# CHECK-NEXT: IPC: 0.19
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 012345678
# CHECK: [0,0] DeeeeeeeeER . . . . ld2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,2] D================eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,3] D=================eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,4] .D=================eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D==============eeeeeeeeER. ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,4] . D==============eeeeeeeeER ld2r { v1.2d, v2.2d }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1584,11 +1584,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 2. 1 17.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 3. 1 18.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 4. 1 18.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 1 12.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 15.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 4. 1 15.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 1 10.8 0.2 0.0 <total>
# CHECK: [24] Code Region - G25
@@ -1597,20 +1597,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.92
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2r { v1.8h, v2.8h }, [x27], #4
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1620,11 +1620,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [25] Code Region - G26
@@ -1633,20 +1633,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.92
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld2r { v1.4h, v2.4h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1656,11 +1656,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld2r { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [26] Code Region - G27
@@ -1669,20 +1669,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 510
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.51
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 2.8
+# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeeeER . ld2r { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1692,33 +1692,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld2r { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld2r { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld2r { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld2r { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [27] Code Region - G28
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.27
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1728,33 +1728,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 1 1.0 0.4 0.0 <total>
# CHECK: [28] Code Region - G29
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 809
# CHECK-NEXT: Total uOps: 3300
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.47
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.08
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0123456
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER .. ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1] . DeeeeeeeeER .. ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER.. ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . .DeeeeeeeeER ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1764,33 +1764,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.6 0.0 <total>
# CHECK: [29] Code Region - G30
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1910
+# CHECK-NEXT: Total Cycles: 1911
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 1.68
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 1.67
# CHECK-NEXT: IPC: 0.26
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D=========eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,4] . D================eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK: [0,0] DeeeeeeeeER . . . . ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . . . . ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . . . . ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . D======eeeeeeeeER . . ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,4] . .D=============eeeeeeeeER ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1800,11 +1800,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 10.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 4. 1 17.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 1 6.4 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 7.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 4. 1 14.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 1 4.8 0.4 0.0 <total>
# CHECK: [30] Code Region - G31
@@ -1813,20 +1813,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.75
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1836,11 +1836,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [31] Code Region - G32
@@ -1849,20 +1849,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.75
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 5.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,1] .D=======eeeeeeeeER . . . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,2] . D==============eeeeeeeeER . . . . ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] . D=====================eeeeeeeeER. . . ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,4] . D============================eeeeeeeeER ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1872,33 +1872,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 8.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 2. 1 15.0 0.0 0.0 ld3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 22.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 4. 1 29.0 0.0 0.0 ld3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 1 15.0 0.2 0.0 <total>
# CHECK: [32] Code Region - G33
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.27
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1908,33 +1908,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 1 1.0 0.4 0.0 <total>
# CHECK: [33] Code Region - G34
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 809
# CHECK-NEXT: Total uOps: 3300
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.47
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.08
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 0123456
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1] .DeeeeeeeeER .. ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4] . .DeeeeeeeeER ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1944,33 +1944,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3r { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.6 0.0 <total>
# CHECK: [34] Code Region - G35
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 510
+# CHECK-NEXT: Total Cycles: 709
# CHECK-NEXT: Total uOps: 3200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.27
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.51
+# CHECK-NEXT: IPC: 0.71
+# CHECK-NEXT: Block RThroughput: 5.3
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeeeeeER. ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeeeeeER ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1980,33 +1980,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld3r { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld3r { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld3r { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld3r { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.4 0.0 <total>
# CHECK: [35] Code Region - G36
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 710
+# CHECK-NEXT: Total Cycles: 1010
# CHECK-NEXT: Total uOps: 4500
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.34
-# CHECK-NEXT: IPC: 0.70
-# CHECK-NEXT: Block RThroughput: 7.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.46
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER .. ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,3] . D=eeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,4] . D==eeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK: [0,0] DeeeeeeeeER . . ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,4] . . DeeeeeeeeeER ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2016,33 +2016,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld3r { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1 1.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [36] Code Region - G37
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 810
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4900
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.05
-# CHECK-NEXT: IPC: 0.62
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.86
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 8.2
# CHECK: Timeline view:
-# CHECK-NEXT: 01234567
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1] .DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,2] . DeeeeeeeeeER . . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,3] . DeeeeeeeeeER. . ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,2] . DeeeeeeeeeER . ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,3] . .DeeeeeeeeeER. ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2052,33 +2052,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 3.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 1 1.6 0.8 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [37] Code Region - G38
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 809
+# CHECK-NEXT: Total Cycles: 1010
# CHECK-NEXT: Total uOps: 4900
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.06
-# CHECK-NEXT: IPC: 0.62
-# CHECK-NEXT: Block RThroughput: 8.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.85
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 8.2
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER .. ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeeER .. ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,2] . DeeeeeeeeER .. ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3] . DeeeeeeeeeER.. ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,4] . D=eeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . . ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeeER . . ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeeeeeER . ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeeeeeER ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2088,11 +2088,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [38] Code Region - G39
@@ -2101,20 +2101,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,1] . D======eeeeeeeeER . . . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,2] . D============eeeeeeeeER . . . . ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,3] . .D==================eeeeeeeeER. . . ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,4] . . D========================eeeeeeeeER ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2124,11 +2124,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 2. 1 13.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 3. 1 19.0 0.0 0.0 ld4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: 4. 1 25.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 1 13.0 0.2 0.0 <total>
# CHECK: [39] Code Region - G40
@@ -2137,20 +2137,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 4003
# CHECK-NEXT: Total uOps: 4000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 0.12
-# CHECK-NEXT: Block RThroughput: 5.0
+# CHECK-NEXT: Block RThroughput: 6.7
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 012
# CHECK: [0,0] DeeeeeeeeER . . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1] D========eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,2] .D===============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,3] .D=======================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,4] . D==============================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] . D======eeeeeeeeER . . . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,2] . D============eeeeeeeeER . . . . ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,3] . .D==================eeeeeeeeER. . . ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,4] . . D========================eeeeeeeeER ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2160,11 +2160,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 2. 1 16.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 3. 1 24.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 4. 1 31.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 1 16.2 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 2. 1 13.0 0.0 0.0 ld4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 3. 1 19.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: 4. 1 25.0 0.0 0.0 ld4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 1 13.0 0.2 0.0 <total>
# CHECK: [40] Code Region - G41
@@ -2173,20 +2173,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 4100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.15
# CHECK-NEXT: IPC: 0.26
-# CHECK-NEXT: Block RThroughput: 5.3
+# CHECK-NEXT: Block RThroughput: 6.8
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 01
# CHECK: [0,0] DeeeeeeeeER . .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1] D========eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,2] .D========eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3] . D========eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,4] . D========eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,1] . D======eeeeeeeeER .. ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,2] . D=====eeeeeeeeER.. ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3] . .D====eeeeeeeeER. ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,4] . . D===eeeeeeeeER ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2196,33 +2196,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1. 1 9.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 2. 1 9.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3. 1 9.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 4. 1 9.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 1 7.4 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 7.0 0.0 0.0 ld4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 2. 1 6.0 0.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3. 1 5.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 1 4.6 0.2 0.0 <total>
# CHECK: [41] Code Region - G42
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 659
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4300
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.53
-# CHECK-NEXT: IPC: 0.76
-# CHECK-NEXT: Block RThroughput: 6.0
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.26
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 7.2
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,3] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,4] . D=eeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1] . DeeeeeeeeER . . ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2232,33 +2232,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 4. 1 2.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [42] Code Region - G43
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 610
+# CHECK-NEXT: Total Cycles: 1009
# CHECK-NEXT: Total uOps: 4200
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.89
-# CHECK-NEXT: IPC: 0.82
-# CHECK-NEXT: Block RThroughput: 5.7
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.16
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeER . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,2] . DeeeeeeeeER . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3] . D=eeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,4] . D==eeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK: [0,0] DeeeeeeeeER . . ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeER . . ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. . ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeeeeER . ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeeeeER ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2268,33 +2268,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 3.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1 1.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ld4r { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 ld4r { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [43] Code Region - G44
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 508
+# CHECK-NEXT: Total Cycles: 808
# CHECK-NEXT: Total uOps: 3400
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 6.69
-# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 4.3
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.21
+# CHECK-NEXT: IPC: 0.62
+# CHECK-NEXT: Block RThroughput: 5.7
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeeeER. ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,2] . DeeeeeeeeER ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3] . D=eeeeeeE-R ldp s1, s2, [x27], #248
-# CHECK-NEXT: [0,4] . D=eeeeeeER ldp d1, d2, [x27], #496
+# CHECK: [0,0] DeeeeeeeeER . ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeeER . ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeeeER. ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeeER. ldp s1, s2, [x27], #248
+# CHECK-NEXT: [0,4] . . DeeeeeeER ldp d1, d2, [x27], #496
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2304,11 +2304,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld4r { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 1.0 ldp s1, s2, [x27], #248
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldp d1, d2, [x27], #496
-# CHECK-NEXT: 1 1.4 0.2 0.2 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 ld4r { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 ld4r { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 ldp s1, s2, [x27], #248
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ldp d1, d2, [x27], #496
+# CHECK-NEXT: 1 1.0 0.8 0.0 <total>
# CHECK: [44] Code Region - G45
@@ -2317,20 +2317,20 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 507
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.54
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.3
+# CHECK-NEXT: Block RThroughput: 3.8
# CHECK: Timeline view:
# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER .. ldp q1, q2, [x27], #992
-# CHECK-NEXT: [0,1] D=eeeeeeER.. ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,2] D==eeeeeeER. ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,3] .D==eeeeeeER ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,4] .D===eeeeE-R ldp w1, w2, [x27], #248
+# CHECK-NEXT: [0,1] .DeeeeeeER.. ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2] . DeeeeeeER. ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3] . DeeeeeeER ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4] . DeeeeE-R ldp w1, w2, [x27], #248
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2340,11 +2340,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp q1, q2, [x27], #992
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp s1, s2, [x27, #248]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp d1, d2, [x27, #496]!
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldp q1, q2, [x27, #992]!
-# CHECK-NEXT: 4. 1 4.0 0.0 1.0 ldp w1, w2, [x27], #248
-# CHECK-NEXT: 1 2.6 0.2 0.2 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 ldp s1, s2, [x27, #248]!
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ldp d1, d2, [x27, #496]!
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 ldp q1, q2, [x27, #992]!
+# CHECK-NEXT: 4. 1 1.0 0.0 1.0 ldp w1, w2, [x27], #248
+# CHECK-NEXT: 1 1.0 0.2 0.2 <total>
# CHECK: [45] Code Region - G46
@@ -2353,10 +2353,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 507
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.14
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.5
# CHECK: Timeline view:
# CHECK-NEXT: 01
@@ -2364,9 +2364,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER .. ldp x1, x2, [x27], #496
# CHECK-NEXT: [0,1] D=eeeeER .. ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,2] D==eeeeER .. ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,3] D===eeeeeER. ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: [0,4] .D===eeeeeER ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,2] .D=eeeeER .. ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3] . D=eeeeeER. ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: [0,4] . D=eeeeeER ldpsw x1, x2, [x27, #248]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2377,10 +2377,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldp x1, x2, [x27], #496
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldp w1, w2, [x27, #248]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldp x1, x2, [x27, #496]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27], #248
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldpsw x1, x2, [x27, #248]!
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldp x1, x2, [x27, #496]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 ldpsw x1, x2, [x27], #248
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ldpsw x1, x2, [x27, #248]!
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [46] Code Region - G47
@@ -2389,10 +2389,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -2400,9 +2400,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27], #254
# CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27], #254
-# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27], #254
-# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27], #254
-# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27], #254
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ldr s1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ldr d1, [x27], #254
+# CHECK-NEXT: [0,4] . D==eeeeeeER ldr q1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2413,10 +2413,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27], #254
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27], #254
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr s1, [x27], #254
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr d1, [x27], #254
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr q1, [x27], #254
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [47] Code Region - G48
@@ -2425,10 +2425,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.98
-# CHECK-NEXT: Block RThroughput: 1.7
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -2436,9 +2436,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeeeER . . ldr b1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeeeER. . ldr h1, [x27, #254]!
-# CHECK-NEXT: [0,2] D==eeeeeeER . ldr s1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeeeeeER. ldr d1, [x27, #254]!
-# CHECK-NEXT: [0,4] D====eeeeeeER ldr q1, [x27, #254]!
+# CHECK-NEXT: [0,2] .D=eeeeeeER . ldr s1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D==eeeeeeER. ldr d1, [x27, #254]!
+# CHECK-NEXT: [0,4] . D==eeeeeeER ldr q1, [x27, #254]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2449,10 +2449,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr b1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr h1, [x27, #254]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr s1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr d1, [x27, #254]!
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldr q1, [x27, #254]!
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ldr s1, [x27, #254]!
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr d1, [x27, #254]!
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ldr q1, [x27, #254]!
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [48] Code Region - G49
@@ -2461,7 +2461,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 1.7
@@ -2473,8 +2473,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldr w1, [x27], #254
# CHECK-NEXT: [0,1] D=eeeeER . ldr x1, [x27], #254
# CHECK-NEXT: [0,2] D==eeeeER . ldr w1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeeeER. ldr x1, [x27, #254]!
-# CHECK-NEXT: [0,4] D====eeeeER ldrb w1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eeeeER. ldr x1, [x27, #254]!
+# CHECK-NEXT: [0,4] .D===eeeeER ldrb w1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2486,9 +2486,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldr w1, [x27], #254
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldr x1, [x27], #254
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldr w1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldr x1, [x27, #254]!
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrb w1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldr x1, [x27, #254]!
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrb w1, [x27], #254
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [49] Code Region - G50
@@ -2497,7 +2497,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 1.7
@@ -2509,8 +2509,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldrb w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeER . ldrh w1, [x27], #254
# CHECK-NEXT: [0,2] D==eeeeER . ldrh w1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeeeER. ldrsb w1, [x27], #254
-# CHECK-NEXT: [0,4] D====eeeeER ldrsb x1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eeeeER. ldrsb w1, [x27], #254
+# CHECK-NEXT: [0,4] .D===eeeeER ldrsb x1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2522,9 +2522,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrb w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrh w1, [x27], #254
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrh w1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsb w1, [x27], #254
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsb x1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldrsb w1, [x27], #254
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrsb x1, [x27], #254
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [50] Code Region - G51
@@ -2533,7 +2533,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 1000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 1.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 1.7
@@ -2545,8 +2545,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldrsb w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeER . ldrsb x1, [x27, #254]!
# CHECK-NEXT: [0,2] D==eeeeER . ldrsh w1, [x27], #254
-# CHECK-NEXT: [0,3] D===eeeeER. ldrsh x1, [x27], #254
-# CHECK-NEXT: [0,4] D====eeeeER ldrsh w1, [x27, #254]!
+# CHECK-NEXT: [0,3] .D==eeeeER. ldrsh x1, [x27], #254
+# CHECK-NEXT: [0,4] .D===eeeeER ldrsh w1, [x27, #254]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2558,9 +2558,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsb w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsb x1, [x27, #254]!
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsh w1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 ldrsh x1, [x27], #254
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 ldrsh w1, [x27, #254]!
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 ldrsh x1, [x27], #254
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 ldrsh w1, [x27, #254]!
+# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
# CHECK: [51] Code Region - G52
@@ -2569,10 +2569,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.38
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 1.0
+# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
@@ -2580,8 +2580,8 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeeeER . ldrsh x1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eeeeER. ldrsw x1, [x27], #254
# CHECK-NEXT: [0,2] D==eeeeER ldrsw x1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eeE-R st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3] .D==eeE-R st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2d }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2593,9 +2593,9 @@ ldr x2, [x1], #254
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ldrsh x1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 ldrsw x1, [x27], #254
# CHECK-NEXT: 2. 1 3.0 0.0 0.0 ldrsw x1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 1.0 st1 { v1.1d }, [x27], #8
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.2 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 1.0 st1 { v1.1d }, [x27], #8
+# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d }, [x27], #16
+# CHECK-NEXT: 1 2.6 0.2 0.2 <total>
# CHECK: [52] Code Region - G53
@@ -2604,7 +2604,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -2614,9 +2614,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeER. . st1 { v1.2s }, [x27], #8
# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: [0,4] . D==eeER st1 { v1.8h }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2627,10 +2627,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8b }, [x27], #8
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b }, [x27], #8
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], #16
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [53] Code Region - G54
@@ -2639,7 +2639,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -2649,9 +2649,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeER. . st1 { v1.16b }, [x27], #16
# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeER st1 { v1.4h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2662,10 +2662,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b }, [x27], #16
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2s }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.4h }, [x27], x28
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [54] Code Region - G55
@@ -2674,7 +2674,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -2684,9 +2684,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeeER. . st1 { v1.4s }, [x27], x28
# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: [0,4] D====eeER st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeER st1 { v1.1d, v2.1d }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2697,10 +2697,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.16b }, [x27], x28
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [55] Code Region - G56
@@ -2709,7 +2709,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.77
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 3.5
@@ -2718,10 +2718,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,4] .D===eeER st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.8b, v2.8b }, [x27], #16
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2731,11 +2731,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 1 1.6 0.2 0.0 <total>
# CHECK: [56] Code Region - G57
@@ -2744,7 +2744,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.17
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 4.0
@@ -2753,10 +2753,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeER. st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeER st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeER. st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeER st1 { v1.2s, v2.2s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2766,11 +2766,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [57] Code Region - G58
@@ -2779,7 +2779,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.17
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 4.0
@@ -2788,10 +2788,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,2] D==eeER . st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3] D===eeER. st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,4] .D===eeER st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeER. st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,4] . DeeER st1 { v1.16b, v2.16b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2801,11 +2801,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [58] Code Region - G59
@@ -2814,7 +2814,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.13
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.0
@@ -2823,10 +2823,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2836,32 +2836,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 1 1.4 0.4 0.0 <total>
# CHECK: [59] Code Region - G60
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3100
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 4.41
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.40
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.5
# CHECK: Timeline view:
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK: [0,0] DeeER. . st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,3] . DeeER . st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2871,11 +2872,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 3. 1 4.0 1.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 1 2.6 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 1.2 0.6 0.0 <total>
# CHECK: [60] Code Region - G61
@@ -2884,7 +2885,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.13
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.0
@@ -2893,10 +2894,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3] .D==eeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeER . st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . DeeER . st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2906,11 +2907,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 1 2.4 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
# CHECK: [61] Code Region - G62
@@ -2919,7 +2920,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.40
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 6.5
@@ -2929,10 +2930,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3] .D===eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,4] . D====eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeER . st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,2] . D=eeER . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3] . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,4] . D=eeER st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2942,33 +2943,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1 3.0 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 1 1.4 0.6 0.0 <total>
# CHECK: [62] Code Region - G63
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 804
+# CHECK-NEXT: Total Cycles: 805
# CHECK-NEXT: Total uOps: 3700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.60
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01
+# CHECK-NEXT: 012
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. .. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1] D=eeER .. st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,2] .D==eeER .. st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3] . D==eeER .. st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,4] . D=====eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK: [0,0] DeeER. . . st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1] . DeeER . . st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,2] . D=eeER . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3] . DeeER. . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,4] . . D=eeER st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -2978,32 +2979,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1 3.0 0.8 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 4. 1 2.0 2.0 0.0 st1 { v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 1 1.4 1.0 0.0 <total>
# CHECK: [63] Code Region - G64
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 3300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.69
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,2] .D==eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D===eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,4] . D===eeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK: [0,0] DeeER. . st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeER . st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2] . D=eeER . st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeER. st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . .DeeER st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3013,33 +3015,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1 2.8 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st1 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st1 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1 1.4 0.6 0.0 <total>
# CHECK: [64] Code Region - G65
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 706
+# CHECK-NEXT: Total Cycles: 707
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 4.25
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 4.24
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,2] .D===eeeeER . st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,3] . D===eeeeER. st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,4] . D====eeeeER st1 { v1.b }[0], [x27], x28
+# CHECK: [0,0] DeeER. . . st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeER . . st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,3] . D=eeeeER. st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,4] . .D=eeeeER st1 { v1.b }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3049,11 +3051,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st1 { v1.b }[0], [x27], #1
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.b }[8], [x27], #1
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
-# CHECK-NEXT: 1 3.0 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st1 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st1 { v1.b }[0], [x27], #1
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st1 { v1.b }[8], [x27], #1
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 st1 { v1.b }[0], [x27], x28
+# CHECK-NEXT: 1 1.4 0.8 0.0 <total>
# CHECK: [65] Code Region - G66
@@ -3062,7 +3064,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 5.0
@@ -3072,10 +3074,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeER st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,3] . DeeeeER. st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeER st1 { v1.h }[4], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3085,11 +3087,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], #2
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], #2
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.h }[0], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st1 { v1.h }[4], [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [66] Code Region - G67
@@ -3098,7 +3100,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 605
# CHECK-NEXT: Total uOps: 2300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.80
# CHECK-NEXT: IPC: 0.83
# CHECK-NEXT: Block RThroughput: 6.0
@@ -3108,10 +3110,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1] D=eeeeER . st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeER . st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,3] D===eeeeER. st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeeeER . st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER . st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,3] . DeeeeER. st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.2d, v2.2d }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3121,11 +3123,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st1 { v1.s }[0], [x27], #4
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st1 { v1.s }[0], [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], #8
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st1 { v1.d }[0], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [67] Code Region - G68
@@ -3134,7 +3136,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 705
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.69
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
@@ -3144,10 +3146,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER .. st2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1] D=eeeeER .. st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,2] D==eeeeER .. st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,3] .D==eeeeER.. st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,4] .D====eeeeER st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1] .DeeeeER .. st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,2] . DeeeeER .. st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,3] . DeeeeER.. st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,4] . D=eeeeER st2 { v1.8h, v2.8h }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3157,33 +3159,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1 2.8 0.4 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 4. 1 2.0 1.0 0.0 st2 { v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 1 1.2 0.4 0.0 <total>
# CHECK: [68] Code Region - G69
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 805
+# CHECK-NEXT: Total Cycles: 806
# CHECK-NEXT: Total uOps: 2900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.60
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,2] .D===eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK: [0,0] DeeeeER . . st2 { v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1] . DeeeeER . . st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,2] . D=eeeeER . st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3] . D==eeeeER. st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeER st2 { v1.4s, v2.4s }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3193,33 +3195,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1 3.6 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st2 { v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st2 { v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 1.0 0.0 st2 { v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 st2 { v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 1 2.0 0.8 0.0 <total>
# CHECK: [69] Code Region - G70
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 706
+# CHECK-NEXT: Total Cycles: 707
# CHECK-NEXT: Total uOps: 2600
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.68
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012
+# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . . st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,2] .D=eeeeER . . st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3] .D====eeeeER. st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,4] .D=====eeeeER st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK: [0,0] DeeeeER . . st2 { v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . . st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER. . st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3] . D=eeeeER . st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,4] . D==eeeeER st2 { v1.b, v2.b }[8], [x27], #2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3229,11 +3231,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3. 1 5.0 2.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 4. 1 6.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 1 3.2 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st2 { v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3. 1 2.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st2 { v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 1 1.6 0.8 0.0 <total>
# CHECK: [70] Code Region - G71
@@ -3242,7 +3244,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 5.0
@@ -3252,10 +3254,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeER . st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,3] . DeeeeER. st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.h, v2.h }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3265,11 +3267,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [71] Code Region - G72
@@ -3278,7 +3280,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 506
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.95
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 5.0
@@ -3288,10 +3290,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . st2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeER . st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,2] D==eeeeER . st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,3] D===eeeeER. st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,4] .D===eeeeER st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeER . st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,2] . DeeeeER . st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,3] . DeeeeER. st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,4] . DeeeeER st2 { v1.d, v2.d }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3301,11 +3303,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2 { v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 st2 { v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 st2 { v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [72] Code Region - G73
@@ -3314,7 +3316,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 807
# CHECK-NEXT: Total uOps: 3000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.72
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 7.0
@@ -3325,9 +3327,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeER . . . st2g x26, [x27], #4064
# CHECK-NEXT: [0,1] D=eER. . . st2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,2] D==eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3] .D==eeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,4] .D======eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,2] .D=eeeeeeER . st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3] . DeeeeeER . st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,4] . D==eeeeeER st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3338,32 +3340,32 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st2g x26, [x27], #4064
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st2g x26, [x27, #4064]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1 3.2 0.8 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 1 1.8 0.8 0.0 <total>
# CHECK: [73] Code Region - G74
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1405
+# CHECK-NEXT: Total Cycles: 1406
# CHECK-NEXT: Total uOps: 4700
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 3.35
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 3.34
# CHECK-NEXT: IPC: 0.36
# CHECK-NEXT: Block RThroughput: 14.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 0123456789 0
-# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1] .DeeeeeER . . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,2] . D===eeeeeeER . . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3] . D===eeeeeeER. . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,4] . D=======eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK: [0,0] DeeeeeeER . . . st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,2] . D==eeeeeeER. . st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3] . .D=eeeeeeER . st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,4] . . D====eeeeeeER st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3373,33 +3375,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 2. 1 4.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 4. 1 8.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 1 3.6 1.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 2. 1 3.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 4. 1 5.0 4.0 0.0 st3 { v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 1 2.4 1.8 0.0 <total>
# CHECK: [74] Code Region - G75
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1206
+# CHECK-NEXT: Total Cycles: 1207
# CHECK-NEXT: Total uOps: 4100
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.40
# CHECK-NEXT: IPC: 0.41
# CHECK-NEXT: Block RThroughput: 12.0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234567
+# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeER . . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,2] .D===eeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,3] . D===eeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,4] . D======eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK: [0,0] DeeeeeER . . . st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,2] . DeeeeeeER . . st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeER . . st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,4] . . D==eeeeeeER st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3409,33 +3411,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 4. 1 7.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 1 3.6 1.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st3 { v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 4. 1 3.0 3.0 0.0 st3 { v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 1 1.4 1.4 0.0 <total>
# CHECK: [75] Code Region - G76
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1106
+# CHECK-NEXT: Total Cycles: 1107
# CHECK-NEXT: Total uOps: 3800
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 3.44
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 3.43
# CHECK-NEXT: IPC: 0.45
# CHECK-NEXT: Block RThroughput: 11.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeeER . .. st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeER . .. st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,2] .D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,3] . D====eeeeeER .. st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,4] . D=======eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK: [0,0] DeeeeeeER . . . st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,2] . D==eeeeeER . . st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,3] . .D=eeeeeER. . st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,4] . . D==eeeeeER st3 { v1.b, v2.b, v3.b }[8], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3445,20 +3447,20 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 2. 1 5.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 4. 1 8.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 1 4.0 1.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 2. 1 3.0 3.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st3 { v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 4. 1 3.0 2.0 0.0 st3 { v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 1 2.0 1.4 0.0 <total>
# CHECK: [76] Code Region - G77
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1005
+# CHECK-NEXT: Total Cycles: 1006
# CHECK-NEXT: Total uOps: 3500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.48
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 10.0
@@ -3468,10 +3470,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeER . . st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1] D=eeeeeER . . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,2] .D===eeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,3] .D====eeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,4] . D======eeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,1] . DeeeeeER. . st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,2] . DeeeeeER . st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeER . st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,4] . . DeeeeeER st3 { v1.s, v2.s, v3.s }[0], [x27], #12
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3481,33 +3483,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 3. 1 5.0 0.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 4. 1 7.0 2.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 1 3.8 1.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st3 { v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [77] Code Region - G78
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1304
+# CHECK-NEXT: Total Cycles: 1305
# CHECK-NEXT: Total uOps: 4300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.30
# CHECK-NEXT: IPC: 0.38
# CHECK-NEXT: Block RThroughput: 13.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456
+# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeeeER . .. st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeER . .. st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,2] .D===eeeeeER .. st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,3] . D===eeeeeER .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,4] . D=====eeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK: [0,0] DeeeeeER . . . st3 { v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeER. . . st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,2] . DeeeeeER . . st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,3] . .DeeeeeER . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,4] . . DeeeeeeER st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3517,33 +3519,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st3 { v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 2. 1 4.0 2.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 4. 1 6.0 2.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1 3.4 1.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 st3 { v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 1 1.0 1.0 0.0 <total>
# CHECK: [78] Code Region - G79
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 2399
+# CHECK-NEXT: Total Cycles: 2400
# CHECK-NEXT: Total uOps: 6900
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.88
# CHECK-NEXT: IPC: 0.21
# CHECK-NEXT: Block RThroughput: 24.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012
+# CHECK-NEXT: Index 0123456789 0123
-# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1] .DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,2] . D====eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3] . D=========eeeeeeeER. st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,4] . D========eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK: [0,0] DeeeeeeER . . . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,2] . D=eeeeeeER. . . st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3] . . D=====eeeeeeeER . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,4] . . D====eeeeeeeER st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3553,33 +3555,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 2. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3. 1 10.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1 5.2 2.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 2. 1 2.0 2.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3. 1 6.0 5.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 4. 1 5.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 1 3.0 2.0 0.0 <total>
# CHECK: [79] Code Region - G80
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1903
+# CHECK-NEXT: Total Cycles: 1904
# CHECK-NEXT: Total uOps: 5700
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 3.00
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 2.99
# CHECK-NEXT: IPC: 0.26
# CHECK-NEXT: Block RThroughput: 19.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 01
+# CHECK-NEXT: Index 0123456789 012
-# CHECK: [0,0] DeeeeeER . . .. st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1] .DeeeeeeER. . .. st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,2] . D=====eeeeeeER .. st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,3] . D=====eeeeeeeER .. st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,4] . D=========eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK: [0,0] DeeeeeER . . . . st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeER . . . st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,2] . D===eeeeeeER . . st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,3] . . D==eeeeeeeER . . st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,4] . . D====eeeeeeER st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3589,33 +3591,33 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 2. 1 6.0 5.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 6.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 4. 1 10.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1 4.8 2.0 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 2. 1 4.0 4.0 0.0 st4 { v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 st4 { v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 4. 1 5.0 4.0 0.0 st4 { v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 1 2.8 2.0 0.0 <total>
# CHECK: [80] Code Region - G81
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 500
-# CHECK-NEXT: Total Cycles: 1658
+# CHECK-NEXT: Total Cycles: 1659
# CHECK-NEXT: Total uOps: 4900
-# CHECK: Dispatch Width: 16
-# CHECK-NEXT: uOps Per Cycle: 2.96
+# CHECK: Dispatch Width: 6
+# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 0.30
# CHECK-NEXT: Block RThroughput: 16.5
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 0123
+# CHECK-NEXT: Index 0123456789 01234
-# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,2] . D=========eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,3] . D===========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,4] . D============eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK: [0,0] DeeeeeeeER. . . . st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1] . DeeeeeeeER . . . st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,2] . .D======eeeeeeER . st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,3] . . D========eeeeeeER. st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,4] . . D========eeeeeeER st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3626,10 +3628,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 st4 { v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 2. 1 10.0 9.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 3. 1 12.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 4. 1 13.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 1 7.4 2.4 0.0 <total>
+# CHECK-NEXT: 2. 1 7.0 7.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 3. 1 9.0 2.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 4. 1 9.0 0.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 1 5.4 2.2 0.0 <total>
# CHECK: [81] Code Region - G82
@@ -3638,7 +3640,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 757
# CHECK-NEXT: Total uOps: 2500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.30
# CHECK-NEXT: IPC: 0.66
# CHECK-NEXT: Block RThroughput: 7.5
@@ -3648,10 +3650,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . . st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1] D=eeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,2] D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,3] .D===eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,4] .D=====eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,1] .DeeeeeeER. . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,2] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,3] . D=eeeeeeER . st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,4] . D==eeeeeeER st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3661,11 +3663,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 2. 1 4.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 4. 1 6.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 1 3.4 0.6 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 4. 1 3.0 1.0 0.0 st4 { v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 1 1.8 0.6 0.0 <total>
# CHECK: [82] Code Region - G83
@@ -3674,7 +3676,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 704
# CHECK-NEXT: Total uOps: 2700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.84
# CHECK-NEXT: IPC: 0.71
# CHECK-NEXT: Block RThroughput: 7.0
@@ -3684,10 +3686,10 @@ ldr x2, [x1], #254
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeeeER . st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1] D=eeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,2] .D==eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,3] .D===eeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,4] . D===eE--R stg x26, [x27], #4064
+# CHECK-NEXT: [0,1] .DeeeeeeER. st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,2] . D=eeeeER. st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,3] . DeeeeER st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,4] . DeE--R stg x26, [x27], #4064
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3697,11 +3699,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 2. 1 3.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 4. 1 4.0 0.0 2.0 stg x26, [x27], #4064
-# CHECK-NEXT: 1 2.8 0.4 0.4 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 st4 { v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 2. 1 2.0 1.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 st4 { v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 4. 1 1.0 0.0 2.0 stg x26, [x27], #4064
+# CHECK-NEXT: 1 1.2 0.4 0.4 <total>
# CHECK: [83] Code Region - G84
@@ -3710,19 +3712,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.37
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 2.8
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . stg x26, [x27, #4064]!
# CHECK-NEXT: [0,1] D=eER. . stgp x1, x2, [x27], #992
-# CHECK-NEXT: [0,2] D==eER . stgp x1, x2, [x27, #992]!
-# CHECK-NEXT: [0,3] D===eeER. stp s1, s2, [x27], #248
-# CHECK-NEXT: [0,4] .D===eeER stp d1, d2, [x27], #496
+# CHECK-NEXT: [0,2] .D=eER . stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: [0,3] . D=eeER. stp s1, s2, [x27], #248
+# CHECK-NEXT: [0,4] . D=eeER stp d1, d2, [x27], #496
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3733,10 +3735,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stg x26, [x27, #4064]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stgp x1, x2, [x27], #992
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stgp x1, x2, [x27, #992]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp s1, s2, [x27], #248
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 stp d1, d2, [x27], #496
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stgp x1, x2, [x27, #992]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 stp s1, s2, [x27], #248
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 stp d1, d2, [x27], #496
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [84] Code Region - G85
@@ -3745,19 +3747,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.84
# CHECK-NEXT: IPC: 0.71
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeER. . stp q1, q2, [x27], #992
-# CHECK-NEXT: [0,1] D==eeER . stp s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,2] D===eeER . stp d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,3] .D===eeER. stp q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,4] .D=====eER stp w1, w2, [x27], #248
+# CHECK-NEXT: [0,1] .D=eeER . stp s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,2] . D=eeER . stp d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,3] . D=eeER. stp q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,4] . D==eER stp w1, w2, [x27], #248
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3767,11 +3769,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp q1, q2, [x27], #992
-# CHECK-NEXT: 1. 1 3.0 0.0 0.0 stp s1, s2, [x27, #248]!
-# CHECK-NEXT: 2. 1 4.0 0.0 0.0 stp d1, d2, [x27, #496]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stp q1, q2, [x27, #992]!
-# CHECK-NEXT: 4. 1 6.0 0.0 0.0 stp w1, w2, [x27], #248
-# CHECK-NEXT: 1 3.6 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp s1, s2, [x27, #248]!
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp d1, d2, [x27, #496]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 stp q1, q2, [x27, #992]!
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stp w1, w2, [x27], #248
+# CHECK-NEXT: 1 2.0 0.2 0.0 <total>
# CHECK: [85] Code Region - G86
@@ -3780,19 +3782,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 1700
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.37
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 2.8
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeER . . stp x1, x2, [x27], #496
# CHECK-NEXT: [0,1] D=eER. . stp w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,2] D==eER . stp x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,3] D===eeER. str b1, [x27], #254
-# CHECK-NEXT: [0,4] .D===eeER str h1, [x27], #254
+# CHECK-NEXT: [0,2] .D=eER . stp x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,3] . D=eeER. str b1, [x27], #254
+# CHECK-NEXT: [0,4] . D=eeER str h1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3803,10 +3805,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 stp x1, x2, [x27], #496
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stp w1, w2, [x27, #248]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stp x1, x2, [x27, #496]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27], #254
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27], #254
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stp x1, x2, [x27, #496]!
+# CHECK-NEXT: 3. 1 2.0 0.0 0.0 str b1, [x27], #254
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str h1, [x27], #254
+# CHECK-NEXT: 1 1.8 0.2 0.0 <total>
# CHECK: [86] Code Region - G87
@@ -3815,19 +3817,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 504
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.97
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678
# CHECK: [0,0] DeeER. . str s1, [x27], #254
-# CHECK-NEXT: [0,1] D=eeER . str d1, [x27], #254
-# CHECK-NEXT: [0,2] D==eeER . str q1, [x27], #254
-# CHECK-NEXT: [0,3] D===eeER. str b1, [x27, #254]!
-# CHECK-NEXT: [0,4] .D===eeER str h1, [x27, #254]!
+# CHECK-NEXT: [0,1] .DeeER . str d1, [x27], #254
+# CHECK-NEXT: [0,2] . DeeER . str q1, [x27], #254
+# CHECK-NEXT: [0,3] . DeeER. str b1, [x27, #254]!
+# CHECK-NEXT: [0,4] . DeeER str h1, [x27, #254]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3837,11 +3839,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27], #254
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27], #254
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str b1, [x27, #254]!
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str h1, [x27, #254]!
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 str d1, [x27], #254
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str q1, [x27], #254
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 str b1, [x27, #254]!
+# CHECK-NEXT: 4. 1 1.0 0.0 0.0 str h1, [x27, #254]!
+# CHECK-NEXT: 1 1.0 0.2 0.0 <total>
# CHECK: [87] Code Region - G88
@@ -3850,19 +3852,19 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 1800
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.58
# CHECK-NEXT: IPC: 0.99
-# CHECK-NEXT: Block RThroughput: 2.5
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: Index 01234567
# CHECK: [0,0] DeeER. . str s1, [x27, #254]!
-# CHECK-NEXT: [0,1] D=eeER . str d1, [x27, #254]!
-# CHECK-NEXT: [0,2] D==eeER. str q1, [x27, #254]!
-# CHECK-NEXT: [0,3] D===eER. str w1, [x27], #254
-# CHECK-NEXT: [0,4] .D===eER str x1, [x27], #254
+# CHECK-NEXT: [0,1] .DeeER . str d1, [x27, #254]!
+# CHECK-NEXT: [0,2] . DeeER. str q1, [x27, #254]!
+# CHECK-NEXT: [0,3] . DeER. str w1, [x27], #254
+# CHECK-NEXT: [0,4] . D=eER str x1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3872,11 +3874,11 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str s1, [x27, #254]!
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str d1, [x27, #254]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 str q1, [x27, #254]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 str w1, [x27], #254
-# CHECK-NEXT: 4. 1 4.0 0.0 0.0 str x1, [x27], #254
-# CHECK-NEXT: 1 2.8 0.2 0.0 <total>
+# CHECK-NEXT: 1. 1 1.0 0.0 0.0 str d1, [x27, #254]!
+# CHECK-NEXT: 2. 1 1.0 0.0 0.0 str q1, [x27, #254]!
+# CHECK-NEXT: 3. 1 1.0 0.0 0.0 str w1, [x27], #254
+# CHECK-NEXT: 4. 1 2.0 0.0 0.0 str x1, [x27], #254
+# CHECK-NEXT: 1 1.2 0.2 0.0 <total>
# CHECK: [88] Code Region - G89
@@ -3885,7 +3887,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -3895,9 +3897,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeER . . str w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . str x1, [x27, #254]!
-# CHECK-NEXT: [0,2] D==eER . strb w1, [x27], #254
-# CHECK-NEXT: [0,3] D===eER. strb w1, [x27, #254]!
-# CHECK-NEXT: [0,4] D====eER strh w1, [x27], #254
+# CHECK-NEXT: [0,2] .D=eER . strb w1, [x27], #254
+# CHECK-NEXT: [0,3] .D==eER. strb w1, [x27, #254]!
+# CHECK-NEXT: [0,4] . D==eER strh w1, [x27], #254
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3908,10 +3910,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 str w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 str x1, [x27, #254]!
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 strb w1, [x27], #254
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 strb w1, [x27, #254]!
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 strh w1, [x27], #254
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 strb w1, [x27], #254
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 strb w1, [x27, #254]!
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 strh w1, [x27], #254
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [89] Code Region - G90
@@ -3920,7 +3922,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 2.98
# CHECK-NEXT: IPC: 0.99
# CHECK-NEXT: Block RThroughput: 2.5
@@ -3930,9 +3932,9 @@ ldr x2, [x1], #254
# CHECK: [0,0] DeER . . strh w1, [x27, #254]!
# CHECK-NEXT: [0,1] D=eER. . stz2g x26, [x27], #4064
-# CHECK-NEXT: [0,2] D==eER . stz2g x26, [x27, #4064]!
-# CHECK-NEXT: [0,3] D===eER. stzg x26, [x27], #4064
-# CHECK-NEXT: [0,4] D====eER stzg x26, [x27, #4064]!
+# CHECK-NEXT: [0,2] .D=eER . stz2g x26, [x27, #4064]!
+# CHECK-NEXT: [0,3] .D==eER. stzg x26, [x27], #4064
+# CHECK-NEXT: [0,4] . D==eER stzg x26, [x27, #4064]!
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -3943,10 +3945,10 @@ ldr x2, [x1], #254
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 strh w1, [x27, #254]!
# CHECK-NEXT: 1. 1 2.0 0.0 0.0 stz2g x26, [x27], #4064
-# CHECK-NEXT: 2. 1 3.0 0.0 0.0 stz2g x26, [x27, #4064]!
-# CHECK-NEXT: 3. 1 4.0 0.0 0.0 stzg x26, [x27], #4064
-# CHECK-NEXT: 4. 1 5.0 0.0 0.0 stzg x26, [x27, #4064]!
-# CHECK-NEXT: 1 3.0 0.2 0.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 stz2g x26, [x27, #4064]!
+# CHECK-NEXT: 3. 1 3.0 0.0 0.0 stzg x26, [x27], #4064
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 stzg x26, [x27, #4064]!
+# CHECK-NEXT: 1 2.2 0.2 0.0 <total>
# CHECK: [90] Code Region - G91
@@ -3955,7 +3957,7 @@ ldr x2, [x1], #254
# CHECK-NEXT: Total Cycles: 110
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 3.64
# CHECK-NEXT: IPC: 1.82
# CHECK-NEXT: Block RThroughput: 0.7
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
index 1690d9669b396..3ddb525327015 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s
@@ -23,18 +23,18 @@ mov x1, x2
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 0 0.06 mov x1, #0
-# CHECK-NEXT: 1 0 0.06 mov x1, xzr
-# CHECK-NEXT: 1 0 0.06 mov w1, #0
-# CHECK-NEXT: 1 0 0.06 mov w1, wzr
-# CHECK-NEXT: 1 0 0.06 fmov h1, wzr
-# CHECK-NEXT: 1 0 0.06 fmov h1, xzr
-# CHECK-NEXT: 1 0 0.06 fmov s1, wzr
-# CHECK-NEXT: 1 0 0.06 fmov d1, xzr
-# CHECK-NEXT: 1 0 0.06 movi d1, #0000000000000000
-# CHECK-NEXT: 1 0 0.06 movi v1.2d, #0000000000000000
-# CHECK-NEXT: 1 0 0.06 mov w1, w2
-# CHECK-NEXT: 1 0 0.06 mov x1, x2
+# CHECK-NEXT: 1 0 0.17 mov x1, #0
+# CHECK-NEXT: 1 0 0.17 mov x1, xzr
+# CHECK-NEXT: 1 0 0.17 mov w1, #0
+# CHECK-NEXT: 1 0 0.17 mov w1, wzr
+# CHECK-NEXT: 1 0 0.17 fmov h1, wzr
+# CHECK-NEXT: 1 0 0.17 fmov h1, xzr
+# CHECK-NEXT: 1 0 0.17 fmov s1, wzr
+# CHECK-NEXT: 1 0 0.17 fmov d1, xzr
+# CHECK-NEXT: 1 0 0.17 movi d1, #0000000000000000
+# CHECK-NEXT: 1 0 0.17 movi v1.2d, #0000000000000000
+# CHECK-NEXT: 1 0 0.17 mov w1, w2
+# CHECK-NEXT: 1 0 0.17 mov x1, x2
# CHECK: Resources:
# CHECK-NEXT: [0.0] - V2UnitB
More information about the llvm-commits
mailing list