[llvm] [AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td (PR #142565)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 3 02:31:37 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: David Sherwood (david-arm)
<details>
<summary>Changes</summary>
I think that the issue width for neoverse-v2 CPUs is set too
high and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (16, 8 and 6) with runs
of SPEC2017 on a neoverse-v2 machine and I got the highest
overall geomean score with an issue width of 6, although it's
only a marginal 0.14% improvement. I also observed a 1-2%
improvement when testing the Gromacs application with some
workloads. Here are some notable changes in SPEC2017 ref
runtimes, i.e. those with a change of ~0.5% or greater ('-'
means faster):
548.exchange2: -1.7%
510.parest: -0.78%
538.imagick: -0.73%
500.perlbench: -0.57%
525.x264: -0.55%
507.cactuBSSN: -0.5%
520.omnetpp: -0.48%
511.povray: +0.57%
544.nab: +0.65%
503.bwaves: +0.68%
526.blender: +0.75%
If this patch causes any major regressions post-commit it can
be easily reverted, but I think it should be an overall
improvement.
---
Patch is 311.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142565.diff
8 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td (+1-1)
- (modified) llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll (+3-2)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s (+5-5)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s (+62-62)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s (+346-346)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s (+9-9)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s (+976-974)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s (+12-12)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 39f7077ae4514..2fea569296427 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -14,7 +14,7 @@
//===----------------------------------------------------------------------===//
def NeoverseV2Model : SchedMachineModel {
- let IssueWidth = 16; // Micro-ops dispatched at a time.
+ let IssueWidth = 6; // Micro-ops dispatched at a time.
let MicroOpBufferSize = 320; // Entries in micro-op re-order buffer.
let LoadLatency = 4; // Optimistic load latency.
let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
index 0d4c053551011..ecc972ef237b5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown -mcpu=neoverse-v2 -o - | FileCheck %s
+; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown \
+; RUN: -mcpu=neoverse-v1 -mattr=+sve2 -o - | FileCheck %s
define i64 @sabalb_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) {
; CHECK-LABEL: sabalb_i32_to_i64_accumulation
@@ -423,4 +424,4 @@ exit:
ret i16 %reduce
}
-declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
\ No newline at end of file
+declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
index 581dad6b68dcf..54b5f1644be48 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
@@ -2536,14 +2536,14 @@ drps
# CHECK-NEXT: 1 2 0.50 bics x3, xzr, x3, lsl #1
# CHECK-NEXT: 1 2 0.50 tst w3, w7, lsl #31
# CHECK-NEXT: 1 2 0.50 tst x2, x20, asr #2
-# CHECK-NEXT: 1 0 0.06 mov x3, x6
-# CHECK-NEXT: 1 0 0.06 mov x3, xzr
-# CHECK-NEXT: 1 0 0.06 mov wzr, w2
-# CHECK-NEXT: 1 0 0.06 mov w3, w5
+# CHECK-NEXT: 1 0 0.17 mov x3, x6
+# CHECK-NEXT: 1 0 0.17 mov x3, xzr
+# CHECK-NEXT: 1 0 0.17 mov wzr, w2
+# CHECK-NEXT: 1 0 0.17 mov w3, w5
# CHECK-NEXT: 1 1 0.17 movz w2, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 mov w2, #-1235
# CHECK-NEXT: 1 1 0.17 mov x2, #5299989643264
-# CHECK-NEXT: 1 0 0.06 mov x2, #0
+# CHECK-NEXT: 1 0 0.17 mov x2, #0
# CHECK-NEXT: 1 1 0.17 movk w3, #0
# CHECK-NEXT: 1 1 0.17 movz x4, #0, lsl #16
# CHECK-NEXT: 1 1 0.17 movk w5, #0, lsl #16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index fbf65e26e99a5..3398331a67f5b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr b0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr b0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr b0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr b0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr b0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [1] Code Region - FPR16-bit
@@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr h0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr h0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr h0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr h0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr h0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [2] Code Region - FPR32-bit
@@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr s0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr s0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr s0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr s0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr s0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [3] Code Region - FPR64-bit
@@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr d0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr d0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr d0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr d0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr d0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [4] Code Region - FPR128-bit
@@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ldr q0, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ldr q0, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ldr q0, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ldr q0, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ldr q0, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [5] Code Region - SIMD64-bit-b
@@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.8b }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.8b }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.8b }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.8b }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.8b }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [6] Code Region - SIMD64-bit-h
@@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.4h }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.4h }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.4h }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.4h }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.4h }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [7] Code Region - SIMD64-bit-s
@@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.2s }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.2s }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.2s }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.2s }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.2s }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [8] Code Region - SIMD64-bit-d
@@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 44
# CHECK-NEXT: Total uOps: 200
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 4.55
# CHECK-NEXT: IPC: 4.55
# CHECK-NEXT: Block RThroughput: 0.3
@@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,1] D======eeER. add z0.d, z0.d, z0.d
# CHECK-NEXT: [2,0] DeeeeeeE--R. ld1 { v0.1d }, [sp]
# CHECK-NEXT: [2,1] D======eeER. add z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0] D=eeeeeeE-R. ld1 { v0.1d }, [sp]
-# CHECK-NEXT: [3,1] D=======eeER add z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0] .DeeeeeeE-R. ld1 { v0.1d }, [sp]
+# CHECK-NEXT: [3,1] .D======eeER add z0.d, z0.d, z0.d
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 1.3 1.3 1.3 ld1 { v0.1d }, [sp]
-# CHECK-NEXT: 1. 4 7.3 0.0 0.0 add z0.d, z0.d, z0.d
-# CHECK-NEXT: 4 4.3 0.6 0.6 <total>
+# CHECK-NEXT: 0. 4 1.0 1.0 1.3 ld1 { v0.1d }, [sp]
+# CHECK-NEXT: 1. 4 7.0 0.0 0.0 add z0.d, z0.d, z0.d
+# CHECK-NEXT: 4 4.0 0.5 0.6 <total>
# CHECK: [9] Code Region - insr
@@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: Total Cycles: 803
# CHECK-NEXT: Total uOps: 300
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.37
# CHECK-NEXT: IPC: 0.25
# CHECK-NEXT: Block RThroughput: 1.0
@@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [0,1] D======eeER . . . . . add z0.s, z0.s, z0.s
# CHECK-NEXT: [1,0] D========eeeeeeER . . . . insr z0.s, w0
# CHECK-NEXT: [1,1] D==============eeER . . . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [2,0] D================eeeeeeER. . . insr z0.s, w0
-# CHECK-NEXT: [2,1] D======================eeER . . add z0.s, z0.s, z0.s
-# CHECK-NEXT: [3,0] D========================eeeeeeER . insr z0.s, w0
-# CHECK-NEXT: [3,1] D==============================eeER add z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0] .D===============eeeeeeER. . . insr z0.s, w0
+# CHECK-NEXT: [2,1] .D=====================eeER . . add z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0] .D=======================eeeeeeER . insr z0.s, w0
+# CHECK-NEXT: [3,1] .D=============================eeER add z0.s, z0.s, z0.s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 4 13.0 0.3 0.0 insr z0.s, w0
-# CHECK-NEXT: 1. 4 19.0 0.0 0.0 add z0.s, z0.s, z0.s
-# CHECK-NEXT: 4 16.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 4 12.5 0.3 0.0 insr z0.s, w0
+# CHECK-NEXT: 1. 4 18.5 0.0 0.0 add z0.s, z0.s, z0.s
+# CHECK-NEXT: 4 15.5 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
index 0f5ab183f5358..39a779b27fe7f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
@@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
-# CHECK: Dispatch Width: 16
+# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.57
# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 3.0
@@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0
# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0
# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] D==========eeER.. madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0
+# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/142565
More information about the llvm-commits
mailing list