[llvm] [AArch64] Corrected Latency Descriptions for NeoverseV2/N2 Scheduler (PR #147339)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 7 09:20:14 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: <YafetBeyene> (yafet-a)
<details>
<summary>Changes</summary>
Certain vector instructions in the Neoverse V2 and N2 schedulers were using incorrect latencies, which stemmed from errors in the Software Optimisation Guide ([SWOG](https://developer.arm.com/documentation/109898/latest/)). This PR updates the Neoverse V2 and N2 schedulers to use the correct latencies and updates the relevant llvm-mca tests accordingly.
---
Patch is 330.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147339.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td (+8-13)
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td (+14-3)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s (+9-9)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s (+2007-2011)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/V2-neon-instructions.s (+15-15)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 91a707910a7f3..59e5afefa97ab 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -175,8 +175,8 @@ def N2Write_2c_1L01_1V : SchedWriteRes<[N2UnitL01, N2UnitV]> {
let NumMicroOps = 2;
}
-def N2Write_4c_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> {
- let Latency = 4;
+def N2Write_5c_1V1_1V : SchedWriteRes<[N2UnitV1, N2UnitV]> {
+ let Latency = 5;
let NumMicroOps = 2;
}
@@ -294,11 +294,6 @@ def N2Write_9c_1L_1V : SchedWriteRes<[N2UnitL, N2UnitV]> {
let NumMicroOps = 2;
}
-def N2Write_4c_2V1 : SchedWriteRes<[N2UnitV1, N2UnitV1]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-
//===----------------------------------------------------------------------===//
// Define generic 3 micro-op types
@@ -1006,14 +1001,14 @@ def : InstRW<[N2Write_4c_1V1],
(instregex "^SABAv", "^UABAv", "^SABALv", "^UABALv")>;
// ASIMD arith, reduce, 4H/4S
-def : InstRW<[N2Write_2c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+def : InstRW<[N2Write_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
// ASIMD arith, reduce, 8B/8H
-def : InstRW<[N2Write_4c_1V1_1V],
+def : InstRW<[N2Write_5c_1V1_1V],
(instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
// ASIMD arith, reduce, 16B
-def : InstRW<[N2Write_4c_1V1], (instrs ADDVv16i8v, SADDLVv16i8v,
+def : InstRW<[N2Write_6c_1V1], (instrs ADDVv16i8v, SADDLVv16i8v,
UADDLVv16i8v)>;
// ASIMD dot product
@@ -1025,15 +1020,15 @@ def : InstRW<[N2Write_3c_1V],
def : InstRW<[N2Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
// ASIMD max/min, reduce, 4H/4S
-def : InstRW<[N2Write_2c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
+def : InstRW<[N2Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i16v$",
"^[SU](MAX|MIN)Vv4i32v$")>;
// ASIMD max/min, reduce, 8B/8H
-def : InstRW<[N2Write_4c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
+def : InstRW<[N2Write_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8i8v$",
"^[SU](MAX|MIN)Vv8i16v$")>;
// ASIMD max/min, reduce, 16B
-def : InstRW<[N2Write_4c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
+def : InstRW<[N2Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
// ASIMD multiply
def : InstRW<[N2Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
index 8d3a4553d4b73..59dc7847c9125 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
@@ -157,6 +157,7 @@ def V2Write_20c_1V0 : SchedWriteRes<[V2UnitV0]> { let Latency = 20;
def V2Write_2c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 2; }
def V2Write_2c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 2; }
def V2Write_3c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 3; }
+def V2Write_3c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 3; }
def V2Write_4c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 4; }
def V2Write_4c_1V13 : SchedWriteRes<[V2UnitV13]> { let Latency = 4; }
def V2Write_6c_1V1 : SchedWriteRes<[V2UnitV1]> { let Latency = 6; }
@@ -261,6 +262,11 @@ def V2Write_4c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
let NumMicroOps = 2;
}
+def V2Write_5c_1V13_1V : SchedWriteRes<[V2UnitV13, V2UnitV]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
def V2Write_4c_2V0 : SchedWriteRes<[V2UnitV0, V2UnitV0]> {
let Latency = 4;
let NumMicroOps = 2;
@@ -381,6 +387,11 @@ def V2Write_4c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
let NumMicroOps = 2;
}
+def V2Write_6c_2V13 : SchedWriteRes<[V2UnitV13, V2UnitV13]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+
def V2Write_8c_1M0_1V01 : SchedWriteRes<[V2UnitM0, V2UnitV01]> {
let Latency = 8;
let NumMicroOps = 2;
@@ -1468,14 +1479,14 @@ def : SchedAlias<WriteVq, V2Write_2c_1V>;
def : InstRW<[V2Wr_VA, V2Rd_VA], (instregex "^[SU]ABAL?v")>;
// ASIMD arith, reduce, 4H/4S
-def : InstRW<[V2Write_2c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
+def : InstRW<[V2Write_3c_1V13], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
// ASIMD arith, reduce, 8B/8H
-def : InstRW<[V2Write_4c_1V13_1V],
+def : InstRW<[V2Write_5c_1V13_1V],
(instregex "^(ADDV|[SU]ADDLV)v8(i8|i16)v$")>;
// ASIMD arith, reduce, 16B
-def : InstRW<[V2Write_4c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
+def : InstRW<[V2Write_6c_2V13], (instregex "^(ADDV|[SU]ADDLV)v16i8v$")>;
// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
index ef9d4463ebe52..99e39567b1ad6 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-sve-instructions.s
@@ -5066,19 +5066,19 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 2 2 1.00 movs p0.b, p0/z, p0.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15.b
# CHECK-NEXT: 2 2 1.00 movs p15.b, p15/z, p15.b
-# CHECK-NEXT: 1 1 0.20 U mrs x3, ID_AA64ZFR0_EL1
-# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL1
-# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL12
-# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL2
-# CHECK-NEXT: 1 1 0.20 U mrs x3, ZCR_EL3
+# CHECK-NEXT: 1 1 0.10 U mrs x3, ID_AA64ZFR0_EL1
+# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL1
+# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL12
+# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL2
+# CHECK-NEXT: 1 1 0.10 U mrs x3, ZCR_EL3
# CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b
# CHECK-NEXT: 2 5 2.00 msb z0.d, p7/m, z1.d, z31.d
# CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h
# CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s
-# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL1, x3
-# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL12, x3
-# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL2, x3
-# CHECK-NEXT: 1 1 0.20 U msr ZCR_EL3, x3
+# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL1, x3
+# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL12, x3
+# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL2, x3
+# CHECK-NEXT: 1 1 0.10 U msr ZCR_EL3, x3
# CHECK-NEXT: 1 4 1.00 mul z0.b, p7/m, z0.b, z31.b
# CHECK-NEXT: 1 4 1.00 mul z0.b, z1.b, z2.b
# CHECK-NEXT: 2 5 2.00 mul z0.d, p7/m, z0.d, z31.d
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
index dee46a304582b..5ffaf9138d482 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -1185,10 +1185,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 5
+# CHECK: Dispatch Width: 10
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1197,13 +1197,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1215,14 +1215,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.1d }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.2d }, [x27], #16
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
-# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2s }, [x27], #8
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.4h }, [x27], #8
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4s }, [x27], #16
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
# CHECK: [1] Code Region - G02
@@ -1231,10 +1231,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 5
+# CHECK: Dispatch Width: 10
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1243,13 +1243,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1261,14 +1261,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.8b }, [x27], #8
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.8h }, [x27], #16
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.16b }, [x27], #16
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.1d }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.2d }, [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
# CHECK: [2] Code Region - G03
@@ -1277,10 +1277,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1500
-# CHECK: Dispatch Width: 5
+# CHECK: Dispatch Width: 10
# CHECK-NEXT: uOps Per Cycle: 2.95
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 2.5
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1289,13 +1289,13 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] .D=eeeeeeER . ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . D=eeeeeeER. ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7] . D==eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . D==eeeeeeER ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9] . D==eE----R add x0, x27, #1
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5] D===eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1307,14 +1307,14 @@ add x0, x27, 1
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.2s }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.4h }, [x27], x28
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 2.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
-# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
-# CHECK-NEXT: 7. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 3.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
-# CHECK-NEXT: 9. 1 3.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 2.3 0.1 2.0 <total>
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.4s }, [x27], x28
+# CHECK-NEXT: 5. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.8b }, [x27], x28
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.8h }, [x27], x28
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.1 0.1 2.0 <total>
# CHECK: [3] Code Region - G04
@@ -1323,10 +1323,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 1900
-# CHECK: Dispatch Width: 5
+# CHECK: Dispatch Width: 10
# CHECK-NEXT: uOps Per Cycle: 3.74
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 3.8
+# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 012
@@ -1334,14 +1334,14 @@ add x0, x27, 1
# CHECK: [0,0] DeeeeeeER . . ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: [0,1] D=eE----R . . add x0, x27, #1
-# CHECK-NEXT: [0,2] .DeeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3] .D=eE----R. . add x0, x27, #1
-# CHECK-NEXT: [0,4] . DeeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5] . D=eE----R . add x0, x27, #1
-# CHECK-NEXT: [0,6] . DeeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7] . D=eE----R. add x0, x27, #1
-# CHECK-NEXT: [0,8] . DeeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9] . D=eE----R add x0, x27, #1
+# CHECK-NEXT: [0,2] D=eeeeeeER. . ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3] D==eE----R. . add x0, x27, #1
+# CHECK-NEXT: [0,4] D==eeeeeeER . ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5] .D==eE----R . add x0, x27, #1
+# CHECK-NEXT: [0,6] .D==eeeeeeER. ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7] .D===eE----R. add x0, x27, #1
+# CHECK-NEXT: [0,8] .D===eeeeeeER ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9] .D====eE----R add x0, x27, #1
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -1352,15 +1352,15 @@ add x0, x27, 1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 ld1 { v1.16b }, [x27], x28
# CHECK-NEXT: 1. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 2. 1 1.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 4. 1 1.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 6. 1 1.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 8. 1 1.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9. 1 2.0 0.0 4.0 add x0, x27, #1
-# CHECK-NEXT: 1 1.5 0.1 2.0 <total>
+# CHECK-NEXT: 2. 1 2.0 0.0 0.0 ld1 { v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 4. 1 3.0 0.0 0.0 ld1 { v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5. 1 3.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 6. 1 3.0 0.0 0.0 ld1 { v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7. 1 4.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 8. 1 4.0 0.0 0.0 ld1 { v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9. 1 5.0 0.0 4.0 add x0, x27, #1
+# CHECK-NEXT: 1 3.0 0.1 2.0 <total>
# CHECK: [4] Code Region - G05
@@ -1369,10 +1369,10 @@ add x0, x27, 1
# CHECK-NEXT: Total Cycles: 508
# CHECK-NEXT: Total uOps: 2000
-# CHECK: Dispatch Width: 5
+# CHECK: Dispatch Width: 10
# CHECK-NEXT: uOps Per Cycle: 3.94
# CHECK-NEXT: IPC: 1.97
-# CHECK-NEXT: Block RThroughput: 4.0
+# CHECK-NEXT: Block RThroughput: 3.3
# CHECK: Timeline view:
# CHECK-NEXT: ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/147339
More information about the llvm-commits
mailing list