[llvm] [AArch64] Change IssueWidth to 6 in AArch64SchedNeoverseV2.td (PR #142565)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 3 02:31:37 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: David Sherwood (david-arm)

<details>
<summary>Changes</summary>

I think that the issue width for neoverse-v2 CPUs is set too
high and does not properly reflect the dispatch constraints.
I tested various values of IssueWidth (16, 8 and 6) with runs
of SPEC2017 on a neoverse-v2 machine and I got the highest
overall geomean score with an issue width of 6, although it's
only a marginal 0.14% improvement. I also observed a 1-2%
improvement when testing the Gromacs application with some
workloads. Here are some notable changes in SPEC2017 ref
runtimes, i.e. has a ~0.5% change or greater ('-' means
faster):

548.exchange2: -1.7%
510.parest: -0.78%
538.imagick: -0.73%
500.perlbench: -0.57%
525.x264: -0.55%
507.cactuBSSN: -0.5%
520.omnetpp: -0.48%
511.povray: +0.57%
544.nab: +0.65%
503.bwaves: +0.68%
526.blender: +0.75%

If this patch causes any major regressions post-commit it can
be easily reverted, but I think it should be an overall
improvement.

---

Patch is 311.03 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142565.diff


8 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll (+3-2) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s (+5-5) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s (+62-62) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s (+346-346) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-sve-instructions.s (+9-9) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-writeback.s (+976-974) 
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-zero-lat-movs.s (+12-12) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 39f7077ae4514..2fea569296427 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -14,7 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 def NeoverseV2Model : SchedMachineModel {
-  let IssueWidth            =  16; // Micro-ops dispatched at a time.
+  let IssueWidth            =   6; // Micro-ops dispatched at a time.
   let MicroOpBufferSize     = 320; // Entries in micro-op re-order buffer.
   let LoadLatency           =   4; // Optimistic load latency.
   let MispredictPenalty     =  10; // Extra cycles for mispredicted branch.  NOTE: Copied from N2.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
index 0d4c053551011..ecc972ef237b5 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-reassociate-accumulators-sve.ll
@@ -1,4 +1,5 @@
-; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown -mcpu=neoverse-v2 -o - | FileCheck %s
+; RUN: opt -passes=loop-unroll %s -o - | llc -O3 - -mtriple=aarch64-unknown-unknown \
+; RUN:   -mcpu=neoverse-v1 -mattr=+sve2 -o - | FileCheck %s
 
 define i64 @sabalb_i32_to_i64_accumulation(ptr %ptr1, ptr %ptr2) {
   ; CHECK-LABEL: sabalb_i32_to_i64_accumulation
@@ -423,4 +424,4 @@ exit:
   ret i16 %reduce
 }
 
-declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
\ No newline at end of file
+declare <vscale x 8 x i16> @llvm.aarch64.sve.add.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>)
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
index 581dad6b68dcf..54b5f1644be48 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-basic-instructions.s
@@ -2536,14 +2536,14 @@ drps
 # CHECK-NEXT:  1      2     0.50                        bics	x3, xzr, x3, lsl #1
 # CHECK-NEXT:  1      2     0.50                        tst	w3, w7, lsl #31
 # CHECK-NEXT:  1      2     0.50                        tst	x2, x20, asr #2
-# CHECK-NEXT:  1      0     0.06                        mov	x3, x6
-# CHECK-NEXT:  1      0     0.06                        mov	x3, xzr
-# CHECK-NEXT:  1      0     0.06                        mov	wzr, w2
-# CHECK-NEXT:  1      0     0.06                        mov	w3, w5
+# CHECK-NEXT:  1      0     0.17                        mov	x3, x6
+# CHECK-NEXT:  1      0     0.17                        mov	x3, xzr
+# CHECK-NEXT:  1      0     0.17                        mov	wzr, w2
+# CHECK-NEXT:  1      0     0.17                        mov	w3, w5
 # CHECK-NEXT:  1      1     0.17                        movz	w2, #0, lsl #16
 # CHECK-NEXT:  1      1     0.17                        mov	w2, #-1235
 # CHECK-NEXT:  1      1     0.17                        mov	x2, #5299989643264
-# CHECK-NEXT:  1      0     0.06                        mov	x2, #0
+# CHECK-NEXT:  1      0     0.17                        mov	x2, #0
 # CHECK-NEXT:  1      1     0.17                        movk	w3, #0
 # CHECK-NEXT:  1      1     0.17                        movz	x4, #0, lsl #16
 # CHECK-NEXT:  1      1     0.17                        movk	w5, #0, lsl #16
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
index fbf65e26e99a5..3398331a67f5b 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-clear-upper-regs.s
@@ -58,7 +58,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -116,8 +116,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	b0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	b0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	b0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -126,9 +126,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	b0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	b0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [1] Code Region - FPR16-bit
 
@@ -137,7 +137,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -195,8 +195,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	h0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	h0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	h0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -205,9 +205,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	h0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	h0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [2] Code Region - FPR32-bit
 
@@ -216,7 +216,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -274,8 +274,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	s0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	s0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	s0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -284,9 +284,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	s0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	s0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [3] Code Region - FPR64-bit
 
@@ -295,7 +295,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -353,8 +353,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	d0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	d0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	d0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -363,9 +363,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	d0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	d0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [4] Code Region - FPR128-bit
 
@@ -374,7 +374,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -432,8 +432,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ldr	q0, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ldr	q0, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ldr	q0, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -442,9 +442,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ldr	q0, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ldr	q0, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [5] Code Region - SIMD64-bit-b
 
@@ -453,7 +453,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -511,8 +511,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.8b }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -521,9 +521,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.8b }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.8b }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [6] Code Region - SIMD64-bit-h
 
@@ -532,7 +532,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -590,8 +590,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.4h }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -600,9 +600,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.4h }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.4h }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [7] Code Region - SIMD64-bit-s
 
@@ -611,7 +611,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -669,8 +669,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.2s }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -679,9 +679,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.2s }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.2s }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [8] Code Region - SIMD64-bit-d
 
@@ -690,7 +690,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      44
 # CHECK-NEXT: Total uOps:        200
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    4.55
 # CHECK-NEXT: IPC:               4.55
 # CHECK-NEXT: Block RThroughput: 0.3
@@ -748,8 +748,8 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,1]     D======eeER.   add	z0.d, z0.d, z0.d
 # CHECK-NEXT: [2,0]     DeeeeeeE--R.   ld1	{ v0.1d }, [sp]
 # CHECK-NEXT: [2,1]     D======eeER.   add	z0.d, z0.d, z0.d
-# CHECK-NEXT: [3,0]     D=eeeeeeE-R.   ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: [3,1]     D=======eeER   add	z0.d, z0.d, z0.d
+# CHECK-NEXT: [3,0]     .DeeeeeeE-R.   ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: [3,1]     .D======eeER   add	z0.d, z0.d, z0.d
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -758,9 +758,9 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     1.3    1.3    1.3       ld1	{ v0.1d }, [sp]
-# CHECK-NEXT: 1.     4     7.3    0.0    0.0       add	z0.d, z0.d, z0.d
-# CHECK-NEXT:        4     4.3    0.6    0.6       <total>
+# CHECK-NEXT: 0.     4     1.0    1.0    1.3       ld1	{ v0.1d }, [sp]
+# CHECK-NEXT: 1.     4     7.0    0.0    0.0       add	z0.d, z0.d, z0.d
+# CHECK-NEXT:        4     4.0    0.5    0.6       <total>
 
 # CHECK:      [9] Code Region - insr
 
@@ -769,7 +769,7 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: Total Cycles:      803
 # CHECK-NEXT: Total uOps:        300
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.37
 # CHECK-NEXT: IPC:               0.25
 # CHECK-NEXT: Block RThroughput: 1.0
@@ -825,10 +825,10 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [0,1]     D======eeER    .    .    .    .   .   add	z0.s, z0.s, z0.s
 # CHECK-NEXT: [1,0]     D========eeeeeeER   .    .    .   .   insr	z0.s, w0
 # CHECK-NEXT: [1,1]     D==============eeER .    .    .   .   add	z0.s, z0.s, z0.s
-# CHECK-NEXT: [2,0]     D================eeeeeeER.    .   .   insr	z0.s, w0
-# CHECK-NEXT: [2,1]     D======================eeER   .   .   add	z0.s, z0.s, z0.s
-# CHECK-NEXT: [3,0]     D========================eeeeeeER .   insr	z0.s, w0
-# CHECK-NEXT: [3,1]     D==============================eeER   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [2,0]     .D===============eeeeeeER.    .   .   insr	z0.s, w0
+# CHECK-NEXT: [2,1]     .D=====================eeER   .   .   add	z0.s, z0.s, z0.s
+# CHECK-NEXT: [3,0]     .D=======================eeeeeeER .   insr	z0.s, w0
+# CHECK-NEXT: [3,1]     .D=============================eeER   add	z0.s, z0.s, z0.s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -837,6 +837,6 @@ add z0.s, z0.s, z0.s
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     4     13.0   0.3    0.0       insr	z0.s, w0
-# CHECK-NEXT: 1.     4     19.0   0.0    0.0       add	z0.s, z0.s, z0.s
-# CHECK-NEXT:        4     16.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     4     12.5   0.3    0.0       insr	z0.s, w0
+# CHECK-NEXT: 1.     4     18.5   0.0    0.0       add	z0.s, z0.s, z0.s
+# CHECK-NEXT:        4     15.5   0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
index 0f5ab183f5358..39a779b27fe7f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-forwarding.s
@@ -315,7 +315,7 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
-# CHECK:      Dispatch Width:    16
+# CHECK:      Dispatch Width:    6
 # CHECK-NEXT: uOps Per Cycle:    0.57
 # CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 3.0
@@ -330,8 +330,8 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK-NEXT: [0,3]     D=====eeER.    ..   madd	x0, x0, x0, x0
 # CHECK-NEXT: [1,0]     D=======eeER   ..   mul	x0, x0, x0
 # CHECK-NEXT: [1,1]     D=========eeER ..   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,2]     D==========eeER..   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,3]     D============eeER   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,2]     .D=========eeER..   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     .D===========eeER   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -342,9 +342,9 @@ bfmlalb z0.s, z0.h, z1.h
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
 # CHECK-NEXT: 1.     2     6.5    0.0    0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     2     7.5    0.0    0.0       madd	x0, x1, x...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/142565


More information about the llvm-commits mailing list