[llvm] [AArch64] Model late forwarding in Neoverse N1 (PR #177590)
Amina Chabane via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 2 07:50:23 PST 2026
https://github.com/Amichaxx updated https://github.com/llvm/llvm-project/pull/177590
>From 2dfbf7cd341433f444e5dec5cf69d133dbb5f04a Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 22 Jan 2026 16:36:40 +0000
Subject: [PATCH 1/8] [AArch64] Model late forwarding in Neoverse N1
---
.../Target/AArch64/AArch64SchedNeoverseN1.td | 99 ++++--
.../AArch64/Neoverse/N1-basic-instructions.s | 90 +++---
.../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 287 ++++++++++++++++++
3 files changed, 410 insertions(+), 66 deletions(-)
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 80e5bff5abba7..d6de36c6081e4 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -261,6 +261,49 @@ def N1Write_9c_6L_6V : SchedWriteRes<[N1UnitL, N1UnitL, N1UnitL,
N1UnitV, N1UnitV, N1UnitV,
N1UnitV, N1UnitV, N1UnitV]>;
+//===----------------------------------------------------------------------===//
+// Define forwarded types: N1Wr_* writes paired with N1Rd_* read advances.
+// Integer multiply-accumulate: W-form (IM32), X-form and long (IM64).
+def N1Wr_IM32 : WriteSequence<[N1Write_2c_1M]>;
+def N1Wr_IM64 : WriteSequence<[N1Write_4c3_1M]>;
+def N1Rd_IMA : SchedReadAdvance<1, [N1Wr_IM32, N1Wr_IM64]>;
+// Scalar FP multiply-accumulate; WriteFMul is also a listed producer.
+def N1Wr_FMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
+// ASIMD absolute diff accumulate.
+def N1Wr_VA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
+// ASIMD multiply accumulate: D-form (VMA) and Q-form (VMAQ).
+def N1Wr_VMA : WriteSequence<[N1Write_4c_1V0]>;
+def N1Wr_VMAQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Rd_VMA : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
+// ASIMD multiply accumulate high: D-form (VMAH) and Q-form (VMAHQ).
+def N1Wr_VMAH : WriteSequence<[N1Write_4c_1V0]>;
+def N1Wr_VMAHQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Rd_VMAH : SchedReadAdvance<2, [N1Wr_VMAH, N1Wr_VMAHQ]>;
+// ASIMD multiply accumulate long.
+def N1Wr_VMAL : WriteSequence<[N1Write_4c_1V0]>;
+def N1Rd_VMAL : SchedReadAdvance<3, [N1Wr_VMAL]>;
+// ASIMD pairwise add and accumulate long.
+def N1Wr_VPA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
+// ASIMD shift accumulate.
+def N1Wr_VSA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Rd_VSA : SchedReadAdvance<3, [N1Wr_VSA]>;
+// ASIMD FP multiply (VFM) and FP multiply accumulate (VFMA).
+def N1Wr_VFM : WriteSequence<[N1Write_3c_1V]>;
+def N1Wr_VFMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
+// ASIMD FP multiply accumulate long.
+def N1Wr_VFMAL : WriteSequence<[N1Write_5c_1V]>;
+def N1Rd_VFMAL : SchedReadAdvance<2, [N1Wr_VFMAL]>;
+// CRC checksum ops.
+def N1Wr_CRC : WriteSequence<[N1Write_2c_1M]>;
+def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
+// Multiply high.
+def N1Wr_MH : WriteSequence<[N1Write_5c3_1M]>;
+def N1Rd_MH : SchedReadAdvance<2, [N1Wr_MH]>;
+
// Miscellaneous Instructions
// -----------------------------------------------------------------------------
@@ -327,13 +370,21 @@ def : InstRW<[N1Write_2c_1M], (instregex "^(AND|BIC)S[WX]rs$")>;
def : SchedAlias<WriteID32, N1Write_12c5_1M>;
def : SchedAlias<WriteID64, N1Write_20c5_1M>;
-// Multiply accumulate
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
// Multiply accumulate, long
def : SchedAlias<WriteIM32, N1Write_2c_1M>;
def : SchedAlias<WriteIM64, N1Write_4c3_1M>;
+def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
+ (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+ (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+ (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+// NOTE(review): long forms use N1Wr_IM64; tests show 2c -> 4c, confirm.
// Multiply high
-def : InstRW<[N1Write_5c3_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
+def : InstRW<[N1Wr_MH, ReadIM, ReadIM, N1Rd_MH], (instrs SMULHrr, UMULHrr)>; // NOTE(review): one more read entry than the replaced line; confirm it binds to an operand
// Miscellaneous data-processing instructions
@@ -431,7 +482,7 @@ def : InstRW<[N1Write_17c7_1V0], (instrs FSQRTDr)>;
def : SchedAlias<WriteFMul, N1Write_3c_1V>;
// FP multiply accumulate
-def : InstRW<[N1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[N1Wr_FMA, ReadDefault, ReadDefault, N1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
// FP round to integral
def : InstRW<[N1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -588,7 +639,7 @@ def : SchedAlias<WriteVq, N1Write_2c_1V>;
// ASIMD absolute diff accum
// ASIMD absolute diff accum long
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[N1Wr_VA, N1Rd_VA], (instregex "^[SU]ABAL?v")>;
// ASIMD arith, reduce, 4H/4S
def : InstRW<[N1Write_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
@@ -609,26 +660,32 @@ def : InstRW<[N1Write_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8(i8|i16)v$")>;
def : InstRW<[N1Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
// ASIMD multiply, D-form
+def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
+ "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
+ "^([SU]|SQD)MULLv")>;
+// The multiply-long forms ([SU]MULL, SQDMULL) are matched by the def above.
// ASIMD multiply accumulate, D-form
+def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
+
// ASIMD multiply accumulate high, D-form
+def : InstRW<[N1Wr_VMAH, N1Rd_VMAH], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
+
// ASIMD multiply accumulate saturating long
// ASIMD multiply long
// ASIMD multiply accumulate long
-def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
- "^ML[AS](v[14]i16|v[12]i32)$",
- "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
- "^SQRDML[AS]H(v[14]i16|v[12]i32)$",
- "^SQDML[AS]Lv",
- "^([SU]|SQD)MULLv",
- "^[SU]ML[AS]Lv")>;
+def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv",
+ "^SQDML[AS]Lv")>;
// ASIMD multiply, Q-form
+def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
+ "^SQ(R)?DMULH(v8i16|v4i32)$")>;
+
// ASIMD multiply accumulate, Q-form
+def : InstRW<[N1Wr_VMAQ, N1Rd_VMA], (instregex "^ML[AS](v8i16|v4i32)$")>;
+
// ASIMD multiply accumulate high, Q-form
-def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
- "^ML[AS](v8i16|v4i32)$",
- "^SQ(R)?DMULH(v8i16|v4i32)$",
- "^SQRDML[AS]H(v8i16|v4i32)$")>;
+def : InstRW<[N1Wr_VMAHQ, N1Rd_VMAH],
+ (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
// ASIMD multiply/multiply long (8x8) polynomial, D-form
def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
@@ -637,10 +694,10 @@ def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
def : InstRW<[N1Write_4c_2V0], (instrs PMULv16i8, PMULLv16i8)>;
// ASIMD pairwise add and accumulate long
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]ADALPv")>;
+def : InstRW<[N1Wr_VPA, N1Rd_VPA], (instregex "^[SU]ADALPv")>;
// ASIMD shift accumulate
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]R?SRAv")>;
+def : InstRW<[N1Wr_VSA, N1Rd_VSA], (instregex "^[SU]R?SRAv")>;
// ASIMD shift by immed, basic
// ASIMD shift by immed and insert, basic
@@ -722,13 +779,13 @@ def : InstRW<[N1Write_5c_1V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
def : InstRW<[N1Write_8c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
// ASIMD FP multiply
-def : InstRW<[N1Write_3c_1V], (instregex "^FMULX?v")>;
+def : InstRW<[N1Wr_VFM], (instregex "^FMULX?v")>;
// ASIMD FP multiply accumulate
-def : InstRW<[N1Write_4c_1V], (instregex "^FML[AS]v")>;
+def : InstRW<[N1Wr_VFMA, N1Rd_VFMA], (instregex "^FML[AS]v")>;
// ASIMD FP multiply accumulate long
-def : InstRW<[N1Write_5c_1V], (instregex "^FML[AS]L2?v")>;
+def : InstRW<[N1Wr_VFMAL, N1Rd_VFMAL], (instregex "^FML[AS]L2?v")>;
// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[N1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
@@ -1053,7 +1110,7 @@ def : InstRW<[N1Write_4c_1V0], (instregex "^SHA1[CMP]rrr$",
// -----------------------------------------------------------------------------
// CRC checksum ops
-def : InstRW<[N1Write_2c_1M], (instregex "^CRC32C?[BHWX]rr$")>;
+def : InstRW<[N1Wr_CRC, N1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
}
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index 0a36c14e43955..fb3f2ccc92441 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -472,32 +472,32 @@
# CHECK-NEXT: 1 4 3.00 msub x13, xzr, x4, x4
# CHECK-NEXT: 1 4 3.00 msub x19, x30, xzr, x29
# CHECK-NEXT: 1 4 3.00 mneg x4, x5, x6
-# CHECK-NEXT: 1 2 1.00 smaddl x3, w5, w2, x9
-# CHECK-NEXT: 1 2 1.00 smaddl xzr, w10, w11, x12
-# CHECK-NEXT: 1 2 1.00 smaddl x13, wzr, w14, x15
-# CHECK-NEXT: 1 2 1.00 smaddl x16, w17, wzr, x18
-# CHECK-NEXT: 1 2 1.00 smull x19, w20, w21
-# CHECK-NEXT: 1 2 1.00 smsubl x3, w5, w2, x9
-# CHECK-NEXT: 1 2 1.00 smsubl xzr, w10, w11, x12
-# CHECK-NEXT: 1 2 1.00 smsubl x13, wzr, w14, x15
-# CHECK-NEXT: 1 2 1.00 smsubl x16, w17, wzr, x18
-# CHECK-NEXT: 1 2 1.00 smnegl x19, w20, w21
-# CHECK-NEXT: 1 2 1.00 umaddl x3, w5, w2, x9
-# CHECK-NEXT: 1 2 1.00 umaddl xzr, w10, w11, x12
-# CHECK-NEXT: 1 2 1.00 umaddl x13, wzr, w14, x15
-# CHECK-NEXT: 1 2 1.00 umaddl x16, w17, wzr, x18
-# CHECK-NEXT: 1 2 1.00 umull x19, w20, w21
-# CHECK-NEXT: 1 2 1.00 umsubl x3, w5, w2, x9
-# CHECK-NEXT: 1 2 1.00 umsubl x16, w17, wzr, x18
-# CHECK-NEXT: 1 2 1.00 umnegl x19, w20, w21
+# CHECK-NEXT: 1 4 3.00 smaddl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 3.00 smaddl xzr, w10, w11, x12
+# CHECK-NEXT: 1 4 3.00 smaddl x13, wzr, w14, x15
+# CHECK-NEXT: 1 4 3.00 smaddl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 3.00 smull x19, w20, w21
+# CHECK-NEXT: 1 4 3.00 smsubl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 3.00 smsubl xzr, w10, w11, x12
+# CHECK-NEXT: 1 4 3.00 smsubl x13, wzr, w14, x15
+# CHECK-NEXT: 1 4 3.00 smsubl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 3.00 smnegl x19, w20, w21
+# CHECK-NEXT: 1 4 3.00 umaddl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 3.00 umaddl xzr, w10, w11, x12
+# CHECK-NEXT: 1 4 3.00 umaddl x13, wzr, w14, x15
+# CHECK-NEXT: 1 4 3.00 umaddl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 3.00 umull x19, w20, w21
+# CHECK-NEXT: 1 4 3.00 umsubl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 3.00 umsubl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 3.00 umnegl x19, w20, w21
# CHECK-NEXT: 1 5 3.00 smulh x23, x22, xzr
# CHECK-NEXT: 1 5 3.00 umulh x23, x22, xzr
# CHECK-NEXT: 1 4 3.00 mul x19, x20, xzr
# CHECK-NEXT: 1 2 1.00 mneg w21, w22, w23
-# CHECK-NEXT: 1 2 1.00 smull x11, w13, w17
-# CHECK-NEXT: 1 2 1.00 umull x11, w13, w17
-# CHECK-NEXT: 1 2 1.00 smnegl x11, w13, w17
-# CHECK-NEXT: 1 2 1.00 umnegl x11, w13, w17
+# CHECK-NEXT: 1 4 3.00 smull x11, w13, w17
+# CHECK-NEXT: 1 4 3.00 umull x11, w13, w17
+# CHECK-NEXT: 1 4 3.00 smnegl x11, w13, w17
+# CHECK-NEXT: 1 4 3.00 umnegl x11, w13, w17
# CHECK-NEXT: 2 3 1.00 extr w3, w5, w7, #0
# CHECK-NEXT: 2 3 1.00 extr w11, w13, w17, #31
# CHECK-NEXT: 2 3 1.00 extr x3, x5, x7, #15
@@ -1264,7 +1264,7 @@
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1.0] [1.1] [2.0] [2.1] [3] [4.0] [4.1] [5] [6]
-# CHECK-NEXT: 26.00 34.00 34.00 252.50 252.50 483.33 197.33 197.33 293.00 161.00
+# CHECK-NEXT: 26.00 34.00 34.00 252.50 252.50 527.33 197.33 197.33 293.00 161.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1.0] [1.1] [2.0] [2.1] [3] [4.0] [4.1] [5] [6] Instructions:
@@ -1730,32 +1730,32 @@
# CHECK-NEXT: - - - - - 3.00 - - - - msub x13, xzr, x4, x4
# CHECK-NEXT: - - - - - 3.00 - - - - msub x19, x30, xzr, x29
# CHECK-NEXT: - - - - - 3.00 - - - - mneg x4, x5, x6
-# CHECK-NEXT: - - - - - 1.00 - - - - smaddl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 1.00 - - - - smaddl xzr, w10, w11, x12
-# CHECK-NEXT: - - - - - 1.00 - - - - smaddl x13, wzr, w14, x15
-# CHECK-NEXT: - - - - - 1.00 - - - - smaddl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 1.00 - - - - smull x19, w20, w21
-# CHECK-NEXT: - - - - - 1.00 - - - - smsubl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 1.00 - - - - smsubl xzr, w10, w11, x12
-# CHECK-NEXT: - - - - - 1.00 - - - - smsubl x13, wzr, w14, x15
-# CHECK-NEXT: - - - - - 1.00 - - - - smsubl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 1.00 - - - - smnegl x19, w20, w21
-# CHECK-NEXT: - - - - - 1.00 - - - - umaddl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 1.00 - - - - umaddl xzr, w10, w11, x12
-# CHECK-NEXT: - - - - - 1.00 - - - - umaddl x13, wzr, w14, x15
-# CHECK-NEXT: - - - - - 1.00 - - - - umaddl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 1.00 - - - - umull x19, w20, w21
-# CHECK-NEXT: - - - - - 1.00 - - - - umsubl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 1.00 - - - - umsubl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 1.00 - - - - umnegl x19, w20, w21
+# CHECK-NEXT: - - - - - 3.00 - - - - smaddl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 3.00 - - - - smaddl xzr, w10, w11, x12
+# CHECK-NEXT: - - - - - 3.00 - - - - smaddl x13, wzr, w14, x15
+# CHECK-NEXT: - - - - - 3.00 - - - - smaddl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 3.00 - - - - smull x19, w20, w21
+# CHECK-NEXT: - - - - - 3.00 - - - - smsubl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 3.00 - - - - smsubl xzr, w10, w11, x12
+# CHECK-NEXT: - - - - - 3.00 - - - - smsubl x13, wzr, w14, x15
+# CHECK-NEXT: - - - - - 3.00 - - - - smsubl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 3.00 - - - - smnegl x19, w20, w21
+# CHECK-NEXT: - - - - - 3.00 - - - - umaddl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 3.00 - - - - umaddl xzr, w10, w11, x12
+# CHECK-NEXT: - - - - - 3.00 - - - - umaddl x13, wzr, w14, x15
+# CHECK-NEXT: - - - - - 3.00 - - - - umaddl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 3.00 - - - - umull x19, w20, w21
+# CHECK-NEXT: - - - - - 3.00 - - - - umsubl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 3.00 - - - - umsubl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 3.00 - - - - umnegl x19, w20, w21
# CHECK-NEXT: - - - - - 3.00 - - - - smulh x23, x22, xzr
# CHECK-NEXT: - - - - - 3.00 - - - - umulh x23, x22, xzr
# CHECK-NEXT: - - - - - 3.00 - - - - mul x19, x20, xzr
# CHECK-NEXT: - - - - - 1.00 - - - - mneg w21, w22, w23
-# CHECK-NEXT: - - - - - 1.00 - - - - smull x11, w13, w17
-# CHECK-NEXT: - - - - - 1.00 - - - - umull x11, w13, w17
-# CHECK-NEXT: - - - - - 1.00 - - - - smnegl x11, w13, w17
-# CHECK-NEXT: - - - - - 1.00 - - - - umnegl x11, w13, w17
+# CHECK-NEXT: - - - - - 3.00 - - - - smull x11, w13, w17
+# CHECK-NEXT: - - - - - 3.00 - - - - umull x11, w13, w17
+# CHECK-NEXT: - - - - - 3.00 - - - - smnegl x11, w13, w17
+# CHECK-NEXT: - - - - - 3.00 - - - - umnegl x11, w13, w17
# CHECK-NEXT: - - - - - 1.33 0.33 0.33 - - extr w3, w5, w7, #0
# CHECK-NEXT: - - - - - 1.33 0.33 0.33 - - extr w11, w13, w17, #31
# CHECK-NEXT: - - - - - 1.33 0.33 0.33 - - extr x3, x5, x7, #15
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
new file mode 100644
index 0000000000000..656b4daff5aab
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -0,0 +1,287 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n1 -mattr=+fp16fml --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32
+mul w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+# LLVM-MCA-END
+# LLVM-MCA-BEGIN smulh
+mul x0, x0, x0
+smulh x0, x1, x2
+smulh x0, x1, x2
+smulh x0, x0, x1
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - madd
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1204
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 01234567
+
+# CHECK: [0,0] DeeeeER . . . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D======eeeeER . . . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,2] D===eeeeE---R . . . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,3] .D========eeeeER . . . madd x0, x0, x0, x0
+# CHECK-NEXT: [1,0] .D==============eeeeER . . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D=================eeeeER. . madd x0, x1, x2, x0
+# CHECK-NEXT: [1,2] . D==========eeeeE------R. . madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] . D===================eeeeER madd x0, x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.0 1.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 12.5 12.5 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 2. 2 7.5 7.5 4.5 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 14.5 3.5 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 10.6 6.3 1.1 <total>
+
+# CHECK: [1] Code Region - fmadd
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeER. . . . . . . . . fadd d0, d0, d0
+# CHECK-NEXT: [0,1] D==eeeeER . . . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,2] D======eeeER . . . . . . . fmul d0, d0, d0
+# CHECK-NEXT: [0,3] .D========eeeeER . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,4] .D============eeeeER. . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,5] .D================eeeeER . . . . . fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] . D===================eeER . . . . fadd d0, d0, d0
+# CHECK-NEXT: [1,1] . D=====================eeeeER. . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] . D=========================eeeER . . . fmul d0, d0, d0
+# CHECK-NEXT: [1,3] . D===========================eeeeER . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,4] . D===============================eeeeER . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] . D===================================eeeeER fmadd d0, d0, d1, d2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 12.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 16.5 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 4. 2 22.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 26.5 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 17.8 0.1 0.0 <total>
+
+# CHECK: [2] Code Region - saba
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] .D========eeeeER . . . .. saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] .D============eeeeER. . . .. saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] . D===============eeeeeER. . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D====================eeeeER . .. saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . D=======================eeeeER .. saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . D===========================eeeeER saba v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 13.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 16.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 20.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 14.8 0.1 0.0 <total>
+
+# CHECK: [3] Code Region - fmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 01234
+
+# CHECK: [0,0] DeeeER . . . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2] D=======eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3] .D========eeeeER . . . . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4] .D============eeeeER. . . . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5] .D================eeeeER . . . . . fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] . D===================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] . D======================eeeeER . . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] . D==========================eeER . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3] . D===========================eeeeER . . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4] . D===============================eeeeER . fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] . D===================================eeeeER fmla v0.2d, v0.2d, v1.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4. 2 22.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 26.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 18.2 0.1 0.0 <total>
+
+# CHECK: [4] Code Region - crc32
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . . . mul w0, w0, w0
+# CHECK-NEXT: [0,1] D==eeER . . . crc32cb w0, w0, w1
+# CHECK-NEXT: [0,2] D====eeER . . . crc32cb w0, w0, w1
+# CHECK-NEXT: [0,3] .D=====eeER . . crc32cb w0, w0, w0
+# CHECK-NEXT: [1,0] .D=======eeER . . mul w0, w0, w0
+# CHECK-NEXT: [1,1] .D=========eeER. . crc32cb w0, w0, w1
+# CHECK-NEXT: [1,2] . D==========eeER . crc32cb w0, w0, w1
+# CHECK-NEXT: [1,3] . D============eeER crc32cb w0, w0, w0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0
+# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 2. 2 8.0 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32cb w0, w0, w0
+# CHECK-NEXT: 2 7.1 0.1 0.0 <total>
+
+# CHECK: [5] Code Region - smulh
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1205
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.33
+# CHECK-NEXT: IPC: 0.33
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789
+# CHECK-NEXT: Index 0123456789 012345678
+
+# CHECK: [0,0] DeeeeER . . . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . smulh x0, x1, x2
+# CHECK-NEXT: [0,2] D======eeeeeER . . . . smulh x0, x1, x2
+# CHECK-NEXT: [0,3] .D===========eeeeeER. . . smulh x0, x0, x1
+# CHECK-NEXT: [1,0] .D=================eeeeER. . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D========eeeeeE--------R. . smulh x0, x1, x2
+# CHECK-NEXT: [1,2] . D=============eeeeeE--R. . smulh x0, x1, x2
+# CHECK-NEXT: [1,3] . D===================eeeeeER smulh x0, x0, x1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 1.0 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 6.5 6.5 4.0 smulh x0, x1, x2
+# CHECK-NEXT: 2. 2 10.5 10.5 1.0 smulh x0, x1, x2
+# CHECK-NEXT: 3. 2 16.0 1.0 0.0 smulh x0, x0, x1
+# CHECK-NEXT: 2 10.6 4.8 1.3 <total>
>From c7f8dfbd724438b2ac86e1100fd028eb452b7fa3 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 12 Feb 2026 00:11:55 +0000
Subject: [PATCH 2/8] n1-forwarding pre-commit
---
.../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 527 +++++++++++++-----
1 file changed, 403 insertions(+), 124 deletions(-)
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index 656b4daff5aab..bb70593f1b91d 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -11,8 +11,8 @@ madd x0, x0, x0, x0
# LLVM-MCA-BEGIN fmadd
fadd d0, d0, d0
fmadd d0, d1, d2, d0
-fmul d0, d0, d0
fmadd d0, d1, d2, d0
+fmul d0, d0, d0
fmadd d0, d1, d2, d0
fmadd d0, d0, d1, d2
# LLVM-MCA-END
@@ -24,52 +24,97 @@ saba v0.4s, v1.4s, v2.4s
saba v0.4s, v0.4s, v1.4s
# LLVM-MCA-END
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqrdmlah
+mul v0.4s, v0.4s, v0.4s
+sqrdmlah v0.4s, v1.4s, v2.4s
+sqrdmlah v0.4s, v1.4s, v2.4s
+sqrdmlah v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ssra
+mul v0.4s, v0.4s, v0.4s
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v0.2d, #1
+# LLVM-MCA-END
+
# LLVM-MCA-BEGIN fmla
fmul v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
-fadd v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
fmla v0.2d, v1.2d, v2.2d
fmla v0.2d, v0.2d, v1.2d
# LLVM-MCA-END
+# LLVM-MCA-BEGIN fmlal
+fmul v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fadd v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
# LLVM-MCA-BEGIN crc32
mul w0, w0, w0
crc32cb w0, w0, w1
crc32cb w0, w0, w1
crc32cb w0, w0, w0
# LLVM-MCA-END
+
# LLVM-MCA-BEGIN smulh
-mul x0, x0, x0
-smulh x0, x1, x2
-smulh x0, x1, x2
+mul x0, x0, x0
smulh x0, x0, x1
+smulh x0, x0, x1
+smulh x0, x0, x0
# LLVM-MCA-END
# CHECK: [0] Code Region - madd
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1204
+# CHECK-NEXT: Total Cycles: 407
# CHECK-NEXT: Total uOps: 400
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.33
-# CHECK-NEXT: IPC: 0.33
-# CHECK-NEXT: Block RThroughput: 12.0
+# CHECK-NEXT: uOps Per Cycle: 0.98
+# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 01234567
-
-# CHECK: [0,0] DeeeeER . . . . . mul x0, x0, x0
-# CHECK-NEXT: [0,1] D======eeeeER . . . . madd x0, x1, x2, x0
-# CHECK-NEXT: [0,2] D===eeeeE---R . . . . madd x0, x1, x2, x0
-# CHECK-NEXT: [0,3] .D========eeeeER . . . madd x0, x0, x0, x0
-# CHECK-NEXT: [1,0] .D==============eeeeER . . mul x0, x0, x0
-# CHECK-NEXT: [1,1] .D=================eeeeER. . madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] . D==========eeeeE------R. . madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] . D===================eeeeER madd x0, x0, x0, x0
+# CHECK-NEXT: 012345
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D==eeeeER . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,2] D=eeeeE-R . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,3] .D====eeeeER . madd x0, x0, x0, x0
+# CHECK-NEXT: [1,0] .D========eeeeER mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D===eeeeE-----R madd x0, x1, x2, x0
+# CHECK-NEXT: [1,2] . D=eeeeE------R madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] . D=====eeeeE--R madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -78,40 +123,40 @@ smulh x0, x0, x1
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.0 1.5 0.0 mul x0, x0, x0
-# CHECK-NEXT: 1. 2 12.5 12.5 0.0 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 7.5 7.5 4.5 madd x0, x1, x2, x0
-# CHECK-NEXT: 3. 2 14.5 3.5 0.0 madd x0, x0, x0, x0
-# CHECK-NEXT: 2 10.6 6.3 1.1 <total>
+# CHECK-NEXT: 0. 2 5.0 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 3.5 3.5 2.5 madd x0, x1, x2, x0
+# CHECK-NEXT: 2. 2 2.0 2.0 3.5 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 5.5 0.0 1.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 4.0 1.5 1.8 <total>
# CHECK: [1] Code Region - fmadd
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
-# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 600
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: uOps Per Cycle: 0.32
+# CHECK-NEXT: IPC: 0.32
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 01234
-
-# CHECK: [0,0] DeeER. . . . . . . . . fadd d0, d0, d0
-# CHECK-NEXT: [0,1] D==eeeeER . . . . . . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [0,2] D======eeeER . . . . . . . fmul d0, d0, d0
-# CHECK-NEXT: [0,3] .D========eeeeER . . . . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [0,4] .D============eeeeER. . . . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [0,5] .D================eeeeER . . . . . fmadd d0, d0, d1, d2
-# CHECK-NEXT: [1,0] . D===================eeER . . . . fadd d0, d0, d0
-# CHECK-NEXT: [1,1] . D=====================eeeeER. . . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,2] . D=========================eeeER . . . fmul d0, d0, d0
-# CHECK-NEXT: [1,3] . D===========================eeeeER . . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,4] . D===============================eeeeER . fmadd d0, d1, d2, d0
-# CHECK-NEXT: [1,5] . D===================================eeeeER fmadd d0, d0, d1, d2
+# CHECK-NEXT: Index 0123456789 0123456789 0
+
+# CHECK: [0,0] DeeER. . . . . . . . fadd d0, d0, d0
+# CHECK-NEXT: [0,1] D==eeeeER . . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,2] D====eeeeER . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,3] .D=======eeeER . . . . . . fmul d0, d0, d0
+# CHECK-NEXT: [0,4] .D==========eeeeER . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,5] .D==============eeeeER . . . . fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] . D=================eeER . . . . fadd d0, d0, d0
+# CHECK-NEXT: [1,1] . D===================eeeeER . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] . D=====================eeeeER. . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,3] . D========================eeeER . . fmul d0, d0, d0
+# CHECK-NEXT: [1,4] . D===========================eeeeER . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] . D===============================eeeeER fmadd d0, d0, d1, d2
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -120,16 +165,54 @@ smulh x0, x0, x1
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fadd d0, d0, d0
-# CHECK-NEXT: 1. 2 12.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 2. 2 16.5 0.0 0.0 fmul d0, d0, d0
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 4. 2 22.5 0.0 0.0 fmadd d0, d1, d2, d0
-# CHECK-NEXT: 5. 2 26.5 0.0 0.0 fmadd d0, d0, d1, d2
-# CHECK-NEXT: 2 17.8 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 4. 2 19.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 23.5 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 15.7 0.1 0.0 <total>
# CHECK: [2] Code Region - saba
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . D=================eeeeER . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . D=====================eeeeER saba v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [3] Code Region - mla
+
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
# CHECK-NEXT: Total Cycles: 1703
@@ -138,20 +221,20 @@ smulh x0, x0, x1
# CHECK: Dispatch Width: 3
# CHECK-NEXT: uOps Per Cycle: 0.29
# CHECK-NEXT: IPC: 0.23
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 8.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456
# CHECK-NEXT: Index 0123456789 0123456789
# CHECK: [0,0] DeeeeeER . . . . . .. mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . .. saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2] .D========eeeeER . . . .. saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3] .D============eeeeER. . . .. saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] .D======eeeeeER. . . . .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] .D===========eeeeeER. . . .. mla v0.4s, v0.4s, v1.4s
# CHECK-NEXT: [1,0] . D===============eeeeeER. . .. mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] . D====================eeeeER . .. saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] . D=======================eeeeER .. saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] . D===========================eeeeER saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,1] . D====================eeeeeER. .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . D=====================eeeeeER .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . D==========================eeeeeER mla v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -161,39 +244,235 @@ smulh x0, x0, x1
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 16.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 20.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 14.8 0.1 0.0 <total>
+# CHECK-NEXT: 1. 2 13.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 14.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 19.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+
+# CHECK: [4] Code Region - sqrdmlah
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1803
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.28
+# CHECK-NEXT: IPC: 0.22
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 012345678
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] .D=======eeeeeER . . . . . sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] .D============eeeeeER . . . . sqrdmlah v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] . D================eeeeeER . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=====================eeeeeER . . sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . D=======================eeeeeER . . sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . D============================eeeeeER sqrdmlah v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 14.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 16.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
+
+# CHECK: [5] Code Region - smlal2
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2] . D=================eeeeER . smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3] . D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h
-# CHECK: [3] Code Region - fmla
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [6] Code Region - sadalp
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . sadalp v0.2d, v0.4s
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,2] . D=================eeeeER . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,3] . D=====================eeeeER sadalp v0.2d, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [7] Code Region - ssra
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,2] . D=================eeeeER . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3] . D=====================eeeeER ssra v0.2d, v0.2d, #1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [8] Code Region - fmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
-# CHECK-NEXT: Total Cycles: 2103
+# CHECK-NEXT: Total Cycles: 1703
# CHECK-NEXT: Total uOps: 600
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.29
-# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: uOps Per Cycle: 0.35
+# CHECK-NEXT: IPC: 0.35
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2] D===eeeeER. . . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,3] .D======eeER . . . . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4] .D========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5] .D============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] . D===============eeeER . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] . D================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] . D==================eeeeER . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,3] . D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4] . D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] . D===========================eeeeER fmla v0.2d, v0.2d, v1.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 9.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 11.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 3. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 20.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 13.5 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - fmlal
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2303
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.26
+# CHECK-NEXT: IPC: 0.26
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 01234
-
-# CHECK: [0,0] DeeeER . . . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,2] D=======eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3] .D========eeeeER . . . . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,4] .D============eeeeER. . . . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,5] .D================eeeeER . . . . . fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0] . D===================eeeER . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] . D======================eeeeER . . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2] . D==========================eeER . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3] . D===========================eeeeER . . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4] . D===============================eeeeER . fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5] . D===================================eeeeER fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: Index 0123456789 0123456789 012345678
+
+# CHECK: [0,0] DeeeER . . . . . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2] D======eeeeeER . . . . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,3] .D==========eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5] .D=================eeeeeER . . . . . fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] . D=====================eeeER . . . . . fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] . D========================eeeeeER . . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] . D===========================eeeeeER . . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,3] . D===============================eeER . . . fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4] . D=================================eeeeeER . . fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] . D======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -202,38 +481,38 @@ smulh x0, x0, x1
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 10.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3. 2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4. 2 22.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5. 2 26.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: 2 18.2 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 14.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 3. 2 21.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4. 2 23.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 28.5 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 19.5 0.1 0.0 <total>
-# CHECK: [4] Code Region - crc32
+# CHECK: [10] Code Region - crc32
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total Cycles: 703
# CHECK-NEXT: Total uOps: 400
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: uOps Per Cycle: 0.57
+# CHECK-NEXT: IPC: 0.57
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012345678
+# CHECK-NEXT: 0123456
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeER. . . . mul w0, w0, w0
-# CHECK-NEXT: [0,1] D==eeER . . . crc32cb w0, w0, w1
-# CHECK-NEXT: [0,2] D====eeER . . . crc32cb w0, w0, w1
-# CHECK-NEXT: [0,3] .D=====eeER . . crc32cb w0, w0, w0
-# CHECK-NEXT: [1,0] .D=======eeER . . mul w0, w0, w0
-# CHECK-NEXT: [1,1] .D=========eeER. . crc32cb w0, w0, w1
-# CHECK-NEXT: [1,2] . D==========eeER . crc32cb w0, w0, w1
-# CHECK-NEXT: [1,3] . D============eeER crc32cb w0, w0, w0
+# CHECK: [0,0] DeeER. . .. mul w0, w0, w0
+# CHECK-NEXT: [0,1] D==eeER . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [0,2] D===eeER . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [0,3] .D====eeER. .. crc32cb w0, w0, w0
+# CHECK-NEXT: [1,0] .D======eeER .. mul w0, w0, w0
+# CHECK-NEXT: [1,1] .D========eeER .. crc32cb w0, w0, w1
+# CHECK-NEXT: [1,2] . D========eeER.. crc32cb w0, w0, w1
+# CHECK-NEXT: [1,3] . D==========eeER crc32cb w0, w0, w0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -242,36 +521,36 @@ smulh x0, x0, x1
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0
-# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32cb w0, w0, w1
-# CHECK-NEXT: 2. 2 8.0 0.0 0.0 crc32cb w0, w0, w1
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32cb w0, w0, w0
-# CHECK-NEXT: 2 7.1 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 4.0 0.5 0.0 mul w0, w0, w0
+# CHECK-NEXT: 1. 2 6.0 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 2. 2 6.5 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 3. 2 8.0 0.0 0.0 crc32cb w0, w0, w0
+# CHECK-NEXT: 2 6.1 0.1 0.0 <total>
-# CHECK: [5] Code Region - smulh
+# CHECK: [11] Code Region - smulh
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1205
+# CHECK-NEXT: Total Cycles: 1903
# CHECK-NEXT: Total uOps: 400
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.33
-# CHECK-NEXT: IPC: 0.33
-# CHECK-NEXT: Block RThroughput: 12.0
+# CHECK-NEXT: uOps Per Cycle: 0.21
+# CHECK-NEXT: IPC: 0.21
+# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789
-# CHECK-NEXT: Index 0123456789 012345678
-
-# CHECK: [0,0] DeeeeER . . . . . mul x0, x0, x0
-# CHECK-NEXT: [0,1] D===eeeeeER . . . . smulh x0, x1, x2
-# CHECK-NEXT: [0,2] D======eeeeeER . . . . smulh x0, x1, x2
-# CHECK-NEXT: [0,3] .D===========eeeeeER. . . smulh x0, x0, x1
-# CHECK-NEXT: [1,0] .D=================eeeeER. . mul x0, x0, x0
-# CHECK-NEXT: [1,1] .D========eeeeeE--------R. . smulh x0, x1, x2
-# CHECK-NEXT: [1,2] . D=============eeeeeE--R. . smulh x0, x1, x2
-# CHECK-NEXT: [1,3] . D===================eeeeeER smulh x0, x0, x1
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0
+
+# CHECK: [0,0] DeeeeER . . . . . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D====eeeeeER . . . . . . smulh x0, x0, x1
+# CHECK-NEXT: [0,2] D=========eeeeeER . . . . . smulh x0, x0, x1
+# CHECK-NEXT: [0,3] .D=============eeeeeER . . . . smulh x0, x0, x0
+# CHECK-NEXT: [1,0] .D==================eeeeER . . . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . smulh x0, x0, x1
+# CHECK-NEXT: [1,2] . D==========================eeeeeER . smulh x0, x0, x1
+# CHECK-NEXT: [1,3] . D===============================eeeeeER smulh x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -280,8 +559,8 @@ smulh x0, x0, x1
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.5 1.0 0.0 mul x0, x0, x0
-# CHECK-NEXT: 1. 2 6.5 6.5 4.0 smulh x0, x1, x2
-# CHECK-NEXT: 2. 2 10.5 10.5 1.0 smulh x0, x1, x2
-# CHECK-NEXT: 3. 2 16.0 1.0 0.0 smulh x0, x0, x1
-# CHECK-NEXT: 2 10.6 4.8 1.3 <total>
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 14.0 0.0 0.0 smulh x0, x0, x1
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 smulh x0, x0, x1
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 smulh x0, x0, x0
+# CHECK-NEXT: 2 16.4 0.1 0.0 <total>
>From 502730ba69cd3f613c34fba4fe3c10b4d1fd9a5e Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 12 Feb 2026 00:21:10 +0000
Subject: [PATCH 3/8] Updated scheduling model + tests
---
.../Target/AArch64/AArch64SchedNeoverseN1.td | 37 ++--
.../AArch64/Neoverse/N1-basic-instructions.s | 158 +++++++++---------
.../AArch64/Neoverse/N1-neon-instructions.s | 12 +-
3 files changed, 105 insertions(+), 102 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index d6de36c6081e4..b890c01dd0ddb 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -264,44 +264,44 @@ def N1Write_9c_6L_6V : SchedWriteRes<[N1UnitL, N1UnitL, N1UnitL,
//===----------------------------------------------------------------------===//
// Define forwarded types
-def N1Wr_IM32 : WriteSequence<[N1Write_2c_1M]>;
-def N1Wr_IM64 : WriteSequence<[N1Write_4c3_1M]>;
+def N1Wr_IM32 : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
+def N1Wr_IM64 : SchedWriteRes<[N1UnitM]> { let Latency = 4; }
def N1Rd_IMA : SchedReadAdvance<1, [N1Wr_IM32, N1Wr_IM64]>;
-def N1Wr_FMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Wr_FMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
-def N1Wr_VA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
-def N1Wr_VMA : WriteSequence<[N1Write_4c_1V0]>;
-def N1Wr_VMAQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Wr_VMA : SchedWriteRes<[N1UnitV0]> { let Latency = 5; }
+def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
def N1Rd_VMA : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
-def N1Wr_VMAH : WriteSequence<[N1Write_4c_1V0]>;
-def N1Wr_VMAHQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Wr_VMAH : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
+def N1Wr_VMAHQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
def N1Rd_VMAH : SchedReadAdvance<2, [N1Wr_VMAH, N1Wr_VMAHQ]>;
-def N1Wr_VMAL : WriteSequence<[N1Write_4c_1V0]>;
+def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
def N1Rd_VMAL : SchedReadAdvance<3, [N1Wr_VMAL]>;
-def N1Wr_VPA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
-def N1Wr_VSA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Wr_VSA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VSA : SchedReadAdvance<3, [N1Wr_VSA]>;
-def N1Wr_VFM : WriteSequence<[N1Write_3c_1V]>;
-def N1Wr_VFMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Wr_VFM : SchedWriteRes<[N1UnitV]> { let Latency = 3; }
+def N1Wr_VFMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
-def N1Wr_VFMAL : WriteSequence<[N1Write_5c_1V]>;
+def N1Wr_VFMAL : SchedWriteRes<[N1UnitV]> { let Latency = 5; }
def N1Rd_VFMAL : SchedReadAdvance<2, [N1Wr_VFMAL]>;
-def N1Wr_CRC : WriteSequence<[N1Write_2c_1M]>;
+def N1Wr_CRC : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
-def N1Wr_MH : WriteSequence<[N1Write_5c3_1M]>;
+def N1Wr_MH : SchedWriteRes<[N1UnitM]> { let Latency = 5; }
def N1Rd_MH : SchedReadAdvance<2, [N1Wr_MH]>;
@@ -329,6 +329,9 @@ def : SchedAlias<WriteBrReg, N1Write_1c_1B>;
// Branch and link, register
def : InstRW<[N1Write_1c_1B_1I], (instrs BL, BLR)>;
+// Compare and branch
+def : InstRW<[N1Write_1c_1B], (instregex "^[CT]BN?Z[XW]$")>;
+
// Arithmetic and Logical Instructions
// -----------------------------------------------------------------------------
@@ -348,7 +351,7 @@ def : SchedAlias<WriteI, N1Write_1c_1I>;
def : SchedAlias<WriteIEReg, N1Write_2c_1M>;
// Arithmetic, LSL shift, shift <= 4
-// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4
// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
def N1WriteISReg : SchedWriteVariant<[
SchedVar<IsCheapLSL, [N1Write_1c_1I]>,
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index fb3f2ccc92441..5b5d29623f52f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -448,56 +448,56 @@
# CHECK-NEXT: 1 2 1.00 crc32ch w25, w26, w16
# CHECK-NEXT: 1 2 1.00 crc32cw w27, w12, w23
# CHECK-NEXT: 1 2 1.00 crc32cx w21, w28, x5
-# CHECK-NEXT: 1 5 3.00 smulh x30, x29, x28
-# CHECK-NEXT: 1 5 3.00 smulh xzr, x27, x26
-# CHECK-NEXT: 1 5 3.00 umulh x30, x29, x28
-# CHECK-NEXT: 1 5 3.00 umulh x23, x30, xzr
+# CHECK-NEXT: 1 5 1.00 smulh x30, x29, x28
+# CHECK-NEXT: 1 5 1.00 smulh xzr, x27, x26
+# CHECK-NEXT: 1 5 1.00 umulh x30, x29, x28
+# CHECK-NEXT: 1 5 1.00 umulh x23, x30, xzr
# CHECK-NEXT: 1 2 1.00 madd w1, w3, w7, w4
# CHECK-NEXT: 1 2 1.00 madd wzr, w0, w9, w11
# CHECK-NEXT: 1 2 1.00 madd w13, wzr, w4, w4
# CHECK-NEXT: 1 2 1.00 madd w19, w30, wzr, w29
# CHECK-NEXT: 1 2 1.00 mul w4, w5, w6
-# CHECK-NEXT: 1 4 3.00 madd x1, x3, x7, x4
-# CHECK-NEXT: 1 4 3.00 madd xzr, x0, x9, x11
-# CHECK-NEXT: 1 4 3.00 madd x13, xzr, x4, x4
-# CHECK-NEXT: 1 4 3.00 madd x19, x30, xzr, x29
-# CHECK-NEXT: 1 4 3.00 mul x4, x5, x6
+# CHECK-NEXT: 1 4 1.00 madd x1, x3, x7, x4
+# CHECK-NEXT: 1 4 1.00 madd xzr, x0, x9, x11
+# CHECK-NEXT: 1 4 1.00 madd x13, xzr, x4, x4
+# CHECK-NEXT: 1 4 1.00 madd x19, x30, xzr, x29
+# CHECK-NEXT: 1 4 1.00 mul x4, x5, x6
# CHECK-NEXT: 1 2 1.00 msub w1, w3, w7, w4
# CHECK-NEXT: 1 2 1.00 msub wzr, w0, w9, w11
# CHECK-NEXT: 1 2 1.00 msub w13, wzr, w4, w4
# CHECK-NEXT: 1 2 1.00 msub w19, w30, wzr, w29
# CHECK-NEXT: 1 2 1.00 mneg w4, w5, w6
-# CHECK-NEXT: 1 4 3.00 msub x1, x3, x7, x4
-# CHECK-NEXT: 1 4 3.00 msub xzr, x0, x9, x11
-# CHECK-NEXT: 1 4 3.00 msub x13, xzr, x4, x4
-# CHECK-NEXT: 1 4 3.00 msub x19, x30, xzr, x29
-# CHECK-NEXT: 1 4 3.00 mneg x4, x5, x6
-# CHECK-NEXT: 1 4 3.00 smaddl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 3.00 smaddl xzr, w10, w11, x12
-# CHECK-NEXT: 1 4 3.00 smaddl x13, wzr, w14, x15
-# CHECK-NEXT: 1 4 3.00 smaddl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 3.00 smull x19, w20, w21
-# CHECK-NEXT: 1 4 3.00 smsubl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 3.00 smsubl xzr, w10, w11, x12
-# CHECK-NEXT: 1 4 3.00 smsubl x13, wzr, w14, x15
-# CHECK-NEXT: 1 4 3.00 smsubl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 3.00 smnegl x19, w20, w21
-# CHECK-NEXT: 1 4 3.00 umaddl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 3.00 umaddl xzr, w10, w11, x12
-# CHECK-NEXT: 1 4 3.00 umaddl x13, wzr, w14, x15
-# CHECK-NEXT: 1 4 3.00 umaddl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 3.00 umull x19, w20, w21
-# CHECK-NEXT: 1 4 3.00 umsubl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 3.00 umsubl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 3.00 umnegl x19, w20, w21
-# CHECK-NEXT: 1 5 3.00 smulh x23, x22, xzr
-# CHECK-NEXT: 1 5 3.00 umulh x23, x22, xzr
-# CHECK-NEXT: 1 4 3.00 mul x19, x20, xzr
+# CHECK-NEXT: 1 4 1.00 msub x1, x3, x7, x4
+# CHECK-NEXT: 1 4 1.00 msub xzr, x0, x9, x11
+# CHECK-NEXT: 1 4 1.00 msub x13, xzr, x4, x4
+# CHECK-NEXT: 1 4 1.00 msub x19, x30, xzr, x29
+# CHECK-NEXT: 1 4 1.00 mneg x4, x5, x6
+# CHECK-NEXT: 1 4 1.00 smaddl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 1.00 smaddl xzr, w10, w11, x12
+# CHECK-NEXT: 1 4 1.00 smaddl x13, wzr, w14, x15
+# CHECK-NEXT: 1 4 1.00 smaddl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 1.00 smull x19, w20, w21
+# CHECK-NEXT: 1 4 1.00 smsubl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 1.00 smsubl xzr, w10, w11, x12
+# CHECK-NEXT: 1 4 1.00 smsubl x13, wzr, w14, x15
+# CHECK-NEXT: 1 4 1.00 smsubl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 1.00 smnegl x19, w20, w21
+# CHECK-NEXT: 1 4 1.00 umaddl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 1.00 umaddl xzr, w10, w11, x12
+# CHECK-NEXT: 1 4 1.00 umaddl x13, wzr, w14, x15
+# CHECK-NEXT: 1 4 1.00 umaddl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 1.00 umull x19, w20, w21
+# CHECK-NEXT: 1 4 1.00 umsubl x3, w5, w2, x9
+# CHECK-NEXT: 1 4 1.00 umsubl x16, w17, wzr, x18
+# CHECK-NEXT: 1 4 1.00 umnegl x19, w20, w21
+# CHECK-NEXT: 1 5 1.00 smulh x23, x22, xzr
+# CHECK-NEXT: 1 5 1.00 umulh x23, x22, xzr
+# CHECK-NEXT: 1 4 1.00 mul x19, x20, xzr
# CHECK-NEXT: 1 2 1.00 mneg w21, w22, w23
-# CHECK-NEXT: 1 4 3.00 smull x11, w13, w17
-# CHECK-NEXT: 1 4 3.00 umull x11, w13, w17
-# CHECK-NEXT: 1 4 3.00 smnegl x11, w13, w17
-# CHECK-NEXT: 1 4 3.00 umnegl x11, w13, w17
+# CHECK-NEXT: 1 4 1.00 smull x11, w13, w17
+# CHECK-NEXT: 1 4 1.00 umull x11, w13, w17
+# CHECK-NEXT: 1 4 1.00 smnegl x11, w13, w17
+# CHECK-NEXT: 1 4 1.00 umnegl x11, w13, w17
# CHECK-NEXT: 2 3 1.00 extr w3, w5, w7, #0
# CHECK-NEXT: 2 3 1.00 extr w11, w13, w17, #31
# CHECK-NEXT: 2 3 1.00 extr x3, x5, x7, #15
@@ -1264,7 +1264,7 @@
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1.0] [1.1] [2.0] [2.1] [3] [4.0] [4.1] [5] [6]
-# CHECK-NEXT: 26.00 34.00 34.00 252.50 252.50 527.33 197.33 197.33 293.00 161.00
+# CHECK-NEXT: 26.00 34.00 34.00 252.50 252.50 449.33 197.33 197.33 293.00 161.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1.0] [1.1] [2.0] [2.1] [3] [4.0] [4.1] [5] [6] Instructions:
@@ -1706,56 +1706,56 @@
# CHECK-NEXT: - - - - - 1.00 - - - - crc32ch w25, w26, w16
# CHECK-NEXT: - - - - - 1.00 - - - - crc32cw w27, w12, w23
# CHECK-NEXT: - - - - - 1.00 - - - - crc32cx w21, w28, x5
-# CHECK-NEXT: - - - - - 3.00 - - - - smulh x30, x29, x28
-# CHECK-NEXT: - - - - - 3.00 - - - - smulh xzr, x27, x26
-# CHECK-NEXT: - - - - - 3.00 - - - - umulh x30, x29, x28
-# CHECK-NEXT: - - - - - 3.00 - - - - umulh x23, x30, xzr
+# CHECK-NEXT: - - - - - 1.00 - - - - smulh x30, x29, x28
+# CHECK-NEXT: - - - - - 1.00 - - - - smulh xzr, x27, x26
+# CHECK-NEXT: - - - - - 1.00 - - - - umulh x30, x29, x28
+# CHECK-NEXT: - - - - - 1.00 - - - - umulh x23, x30, xzr
# CHECK-NEXT: - - - - - 1.00 - - - - madd w1, w3, w7, w4
# CHECK-NEXT: - - - - - 1.00 - - - - madd wzr, w0, w9, w11
# CHECK-NEXT: - - - - - 1.00 - - - - madd w13, wzr, w4, w4
# CHECK-NEXT: - - - - - 1.00 - - - - madd w19, w30, wzr, w29
# CHECK-NEXT: - - - - - 1.00 - - - - mul w4, w5, w6
-# CHECK-NEXT: - - - - - 3.00 - - - - madd x1, x3, x7, x4
-# CHECK-NEXT: - - - - - 3.00 - - - - madd xzr, x0, x9, x11
-# CHECK-NEXT: - - - - - 3.00 - - - - madd x13, xzr, x4, x4
-# CHECK-NEXT: - - - - - 3.00 - - - - madd x19, x30, xzr, x29
-# CHECK-NEXT: - - - - - 3.00 - - - - mul x4, x5, x6
+# CHECK-NEXT: - - - - - 1.00 - - - - madd x1, x3, x7, x4
+# CHECK-NEXT: - - - - - 1.00 - - - - madd xzr, x0, x9, x11
+# CHECK-NEXT: - - - - - 1.00 - - - - madd x13, xzr, x4, x4
+# CHECK-NEXT: - - - - - 1.00 - - - - madd x19, x30, xzr, x29
+# CHECK-NEXT: - - - - - 1.00 - - - - mul x4, x5, x6
# CHECK-NEXT: - - - - - 1.00 - - - - msub w1, w3, w7, w4
# CHECK-NEXT: - - - - - 1.00 - - - - msub wzr, w0, w9, w11
# CHECK-NEXT: - - - - - 1.00 - - - - msub w13, wzr, w4, w4
# CHECK-NEXT: - - - - - 1.00 - - - - msub w19, w30, wzr, w29
# CHECK-NEXT: - - - - - 1.00 - - - - mneg w4, w5, w6
-# CHECK-NEXT: - - - - - 3.00 - - - - msub x1, x3, x7, x4
-# CHECK-NEXT: - - - - - 3.00 - - - - msub xzr, x0, x9, x11
-# CHECK-NEXT: - - - - - 3.00 - - - - msub x13, xzr, x4, x4
-# CHECK-NEXT: - - - - - 3.00 - - - - msub x19, x30, xzr, x29
-# CHECK-NEXT: - - - - - 3.00 - - - - mneg x4, x5, x6
-# CHECK-NEXT: - - - - - 3.00 - - - - smaddl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 3.00 - - - - smaddl xzr, w10, w11, x12
-# CHECK-NEXT: - - - - - 3.00 - - - - smaddl x13, wzr, w14, x15
-# CHECK-NEXT: - - - - - 3.00 - - - - smaddl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 3.00 - - - - smull x19, w20, w21
-# CHECK-NEXT: - - - - - 3.00 - - - - smsubl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 3.00 - - - - smsubl xzr, w10, w11, x12
-# CHECK-NEXT: - - - - - 3.00 - - - - smsubl x13, wzr, w14, x15
-# CHECK-NEXT: - - - - - 3.00 - - - - smsubl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 3.00 - - - - smnegl x19, w20, w21
-# CHECK-NEXT: - - - - - 3.00 - - - - umaddl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 3.00 - - - - umaddl xzr, w10, w11, x12
-# CHECK-NEXT: - - - - - 3.00 - - - - umaddl x13, wzr, w14, x15
-# CHECK-NEXT: - - - - - 3.00 - - - - umaddl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 3.00 - - - - umull x19, w20, w21
-# CHECK-NEXT: - - - - - 3.00 - - - - umsubl x3, w5, w2, x9
-# CHECK-NEXT: - - - - - 3.00 - - - - umsubl x16, w17, wzr, x18
-# CHECK-NEXT: - - - - - 3.00 - - - - umnegl x19, w20, w21
-# CHECK-NEXT: - - - - - 3.00 - - - - smulh x23, x22, xzr
-# CHECK-NEXT: - - - - - 3.00 - - - - umulh x23, x22, xzr
-# CHECK-NEXT: - - - - - 3.00 - - - - mul x19, x20, xzr
+# CHECK-NEXT: - - - - - 1.00 - - - - msub x1, x3, x7, x4
+# CHECK-NEXT: - - - - - 1.00 - - - - msub xzr, x0, x9, x11
+# CHECK-NEXT: - - - - - 1.00 - - - - msub x13, xzr, x4, x4
+# CHECK-NEXT: - - - - - 1.00 - - - - msub x19, x30, xzr, x29
+# CHECK-NEXT: - - - - - 1.00 - - - - mneg x4, x5, x6
+# CHECK-NEXT: - - - - - 1.00 - - - - smaddl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 1.00 - - - - smaddl xzr, w10, w11, x12
+# CHECK-NEXT: - - - - - 1.00 - - - - smaddl x13, wzr, w14, x15
+# CHECK-NEXT: - - - - - 1.00 - - - - smaddl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 1.00 - - - - smull x19, w20, w21
+# CHECK-NEXT: - - - - - 1.00 - - - - smsubl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 1.00 - - - - smsubl xzr, w10, w11, x12
+# CHECK-NEXT: - - - - - 1.00 - - - - smsubl x13, wzr, w14, x15
+# CHECK-NEXT: - - - - - 1.00 - - - - smsubl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 1.00 - - - - smnegl x19, w20, w21
+# CHECK-NEXT: - - - - - 1.00 - - - - umaddl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 1.00 - - - - umaddl xzr, w10, w11, x12
+# CHECK-NEXT: - - - - - 1.00 - - - - umaddl x13, wzr, w14, x15
+# CHECK-NEXT: - - - - - 1.00 - - - - umaddl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 1.00 - - - - umull x19, w20, w21
+# CHECK-NEXT: - - - - - 1.00 - - - - umsubl x3, w5, w2, x9
+# CHECK-NEXT: - - - - - 1.00 - - - - umsubl x16, w17, wzr, x18
+# CHECK-NEXT: - - - - - 1.00 - - - - umnegl x19, w20, w21
+# CHECK-NEXT: - - - - - 1.00 - - - - smulh x23, x22, xzr
+# CHECK-NEXT: - - - - - 1.00 - - - - umulh x23, x22, xzr
+# CHECK-NEXT: - - - - - 1.00 - - - - mul x19, x20, xzr
# CHECK-NEXT: - - - - - 1.00 - - - - mneg w21, w22, w23
-# CHECK-NEXT: - - - - - 3.00 - - - - smull x11, w13, w17
-# CHECK-NEXT: - - - - - 3.00 - - - - umull x11, w13, w17
-# CHECK-NEXT: - - - - - 3.00 - - - - smnegl x11, w13, w17
-# CHECK-NEXT: - - - - - 3.00 - - - - umnegl x11, w13, w17
+# CHECK-NEXT: - - - - - 1.00 - - - - smull x11, w13, w17
+# CHECK-NEXT: - - - - - 1.00 - - - - umull x11, w13, w17
+# CHECK-NEXT: - - - - - 1.00 - - - - smnegl x11, w13, w17
+# CHECK-NEXT: - - - - - 1.00 - - - - umnegl x11, w13, w17
# CHECK-NEXT: - - - - - 1.33 0.33 0.33 - - extr w3, w5, w7, #0
# CHECK-NEXT: - - - - - 1.33 0.33 0.33 - - extr w11, w13, w17, #31
# CHECK-NEXT: - - - - - 1.33 0.33 0.33 - - extr x3, x5, x7, #15
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index 270990154f245..1c03d4be8add2 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -613,10 +613,10 @@
# CHECK-NEXT: 1 2 0.50 mla v0.8b, v0.8b, v0.8b
# CHECK-NEXT: 1 2 0.50 mla v15.8h, v22.8h, v4.h[3]
# CHECK-NEXT: 1 2 0.50 mla v28.2s, v10.2s, v2.s[0]
-# CHECK-NEXT: 1 4 1.00 mls v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 5 1.00 mls v0.4h, v0.4h, v0.4h
# CHECK-NEXT: 1 2 0.50 mls v25.8h, v29.8h, v0.h[4]
# CHECK-NEXT: 1 2 0.50 mls v22.2s, v29.2s, v0.s[3]
-# CHECK-NEXT: 2 5 2.00 mls v26.4s, v5.4s, v28.4s
+# CHECK-NEXT: 1 5 2.00 mls v26.4s, v5.4s, v28.4s
# CHECK-NEXT: 1 2 0.50 mov b0, v0.b[15]
# CHECK-NEXT: 1 2 0.50 mov d6, v0.d[1]
# CHECK-NEXT: 1 2 0.50 mov h2, v0.h[5]
@@ -938,10 +938,10 @@
# CHECK-NEXT: 1 2 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1]
# CHECK-NEXT: 1 4 1.00 sqrdmlah h0, h1, h2
# CHECK-NEXT: 1 4 1.00 sqrdmlah v0.4h, v1.4h, v2.4h
-# CHECK-NEXT: 2 5 2.00 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 1 5 2.00 sqrdmlah v0.8h, v1.8h, v2.8h
# CHECK-NEXT: 1 4 1.00 sqrdmlah s0, s1, s2
# CHECK-NEXT: 1 4 1.00 sqrdmlah v0.2s, v1.2s, v2.2s
-# CHECK-NEXT: 2 5 2.00 sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 1 5 2.00 sqrdmlah v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 1 2 0.50 sqrdmlsh h0, h1, v2.h[3]
# CHECK-NEXT: 1 2 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3]
# CHECK-NEXT: 1 2 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3]
@@ -950,10 +950,10 @@
# CHECK-NEXT: 1 2 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1]
# CHECK-NEXT: 1 4 1.00 sqrdmlsh h0, h1, h2
# CHECK-NEXT: 1 4 1.00 sqrdmlsh v0.4h, v1.4h, v2.4h
-# CHECK-NEXT: 2 5 2.00 sqrdmlsh v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 1 5 2.00 sqrdmlsh v0.8h, v1.8h, v2.8h
# CHECK-NEXT: 1 4 1.00 sqrdmlsh s0, s1, s2
# CHECK-NEXT: 1 4 1.00 sqrdmlsh v0.2s, v1.2s, v2.2s
-# CHECK-NEXT: 2 5 2.00 sqrdmlsh v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 1 5 2.00 sqrdmlsh v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 1 4 1.00 sqrdmulh h10, h11, h12
# CHECK-NEXT: 1 2 0.50 sqrdmulh h7, h15, v0.h[3]
# CHECK-NEXT: 1 2 0.50 sqrdmulh s15, s14, v0.s[1]
>From a63212d36106a4ed189e6785b150623b74e93411 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 12 Feb 2026 01:07:16 +0000
Subject: [PATCH 4/8] Fixups
---
llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index b890c01dd0ddb..6a2bf1d3475c0 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -329,9 +329,6 @@ def : SchedAlias<WriteBrReg, N1Write_1c_1B>;
// Branch and link, register
def : InstRW<[N1Write_1c_1B_1I], (instrs BL, BLR)>;
-// Compare and branch
-def : InstRW<[N1Write_1c_1B], (instregex "^[CT]BN?Z[XW]$")>;
-
// Arithmetic and Logical Instructions
// -----------------------------------------------------------------------------
@@ -351,7 +348,7 @@ def : SchedAlias<WriteI, N1Write_1c_1I>;
def : SchedAlias<WriteIEReg, N1Write_2c_1M>;
// Arithmetic, LSL shift, shift <= 4
-// Arithmetic, flagset, LSL shift, shift <= 4ah
+// Arithmetic, flagset, LSL shift, shift <= 4
// Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
def N1WriteISReg : SchedWriteVariant<[
SchedVar<IsCheapLSL, [N1Write_1c_1I]>,
>From b0a243c38437f3efdf40994d82a49e707648ab5d Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Wed, 25 Feb 2026 16:43:39 +0000
Subject: [PATCH 5/8] Summary: Responds to comments
- Added smaddl test in N1-forwarding.s
- Removed late forwarding for ASIMD multiply accumulate high D/Q form and ASIMD multiply accumulate saturating long
- Updated VFMAL read advance to 3
Co-authored-by: Asher8118
---
.../Target/AArch64/AArch64SchedNeoverseN1.td | 18 +-
.../AArch64/Neoverse/N1-basic-instructions.s | 44 ++--
.../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 195 +++++++-----------
.../AArch64/Neoverse/N1-neon-instructions.s | 8 +-
4 files changed, 106 insertions(+), 159 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 6a2bf1d3475c0..99e0ef1d8c261 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -278,13 +278,6 @@ def N1Wr_VMA : SchedWriteRes<[N1UnitV0]> { let Latency = 5; }
def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
def N1Rd_VMA : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
-def N1Wr_VMAH : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
-def N1Wr_VMAHQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
-def N1Rd_VMAH : SchedReadAdvance<2, [N1Wr_VMAH, N1Wr_VMAHQ]>;
-
-def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
-def N1Rd_VMAL : SchedReadAdvance<3, [N1Wr_VMAL]>;
-
def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
@@ -296,7 +289,7 @@ def N1Wr_VFMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
def N1Wr_VFMAL : SchedWriteRes<[N1UnitV]> { let Latency = 5; }
-def N1Rd_VFMAL : SchedReadAdvance<2, [N1Wr_VFMAL]>;
+def N1Rd_VFMAL : SchedReadAdvance<3, [N1Wr_VFMAL]>;
def N1Wr_CRC : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
@@ -380,7 +373,7 @@ def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
(instregex "^M(ADD|SUB)Wrrr$")>;
def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
(instregex "^M(ADD|SUB)Xrrr$")>;
-def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
(instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
// Multiply high
@@ -668,12 +661,12 @@ def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
// ASIMD multiply accumulate high, D-form
-def : InstRW<[N1Wr_VMAH, N1Rd_VMAH], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
+def : InstRW<[N1Write_4c_1V0], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
// ASIMD multiply accumulate saturating long
// ASIMD multiply long
// ASIMD multiply accumulate long
-def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv",
+def : InstRW<[N1Write_4c_1V0], (instregex "^[SU]ML[AS]Lv",
"^SQDML[AS]Lv")>;
// ASIMD multiply, Q-form
@@ -684,8 +677,7 @@ def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
def : InstRW<[N1Wr_VMAQ, N1Rd_VMA], (instregex "^ML[AS](v8i16|v4i32)$")>;
// ASIMD multiply accumulate high, Q-form
-def : InstRW<[N1Wr_VMAHQ, N1Rd_VMAH],
- (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
+def : InstRW<[N1Write_5c_2V0], (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
// ASIMD multiply/multiply long (8x8) polynomial, D-form
def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index 5b5d29623f52f..6fa54ddf61e2f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -472,32 +472,32 @@
# CHECK-NEXT: 1 4 1.00 msub x13, xzr, x4, x4
# CHECK-NEXT: 1 4 1.00 msub x19, x30, xzr, x29
# CHECK-NEXT: 1 4 1.00 mneg x4, x5, x6
-# CHECK-NEXT: 1 4 1.00 smaddl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 1.00 smaddl xzr, w10, w11, x12
-# CHECK-NEXT: 1 4 1.00 smaddl x13, wzr, w14, x15
-# CHECK-NEXT: 1 4 1.00 smaddl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 1.00 smull x19, w20, w21
-# CHECK-NEXT: 1 4 1.00 smsubl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 1.00 smsubl xzr, w10, w11, x12
-# CHECK-NEXT: 1 4 1.00 smsubl x13, wzr, w14, x15
-# CHECK-NEXT: 1 4 1.00 smsubl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 1.00 smnegl x19, w20, w21
-# CHECK-NEXT: 1 4 1.00 umaddl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 1.00 umaddl xzr, w10, w11, x12
-# CHECK-NEXT: 1 4 1.00 umaddl x13, wzr, w14, x15
-# CHECK-NEXT: 1 4 1.00 umaddl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 1.00 umull x19, w20, w21
-# CHECK-NEXT: 1 4 1.00 umsubl x3, w5, w2, x9
-# CHECK-NEXT: 1 4 1.00 umsubl x16, w17, wzr, x18
-# CHECK-NEXT: 1 4 1.00 umnegl x19, w20, w21
+# CHECK-NEXT: 1 2 1.00 smaddl x3, w5, w2, x9
+# CHECK-NEXT: 1 2 1.00 smaddl xzr, w10, w11, x12
+# CHECK-NEXT: 1 2 1.00 smaddl x13, wzr, w14, x15
+# CHECK-NEXT: 1 2 1.00 smaddl x16, w17, wzr, x18
+# CHECK-NEXT: 1 2 1.00 smull x19, w20, w21
+# CHECK-NEXT: 1 2 1.00 smsubl x3, w5, w2, x9
+# CHECK-NEXT: 1 2 1.00 smsubl xzr, w10, w11, x12
+# CHECK-NEXT: 1 2 1.00 smsubl x13, wzr, w14, x15
+# CHECK-NEXT: 1 2 1.00 smsubl x16, w17, wzr, x18
+# CHECK-NEXT: 1 2 1.00 smnegl x19, w20, w21
+# CHECK-NEXT: 1 2 1.00 umaddl x3, w5, w2, x9
+# CHECK-NEXT: 1 2 1.00 umaddl xzr, w10, w11, x12
+# CHECK-NEXT: 1 2 1.00 umaddl x13, wzr, w14, x15
+# CHECK-NEXT: 1 2 1.00 umaddl x16, w17, wzr, x18
+# CHECK-NEXT: 1 2 1.00 umull x19, w20, w21
+# CHECK-NEXT: 1 2 1.00 umsubl x3, w5, w2, x9
+# CHECK-NEXT: 1 2 1.00 umsubl x16, w17, wzr, x18
+# CHECK-NEXT: 1 2 1.00 umnegl x19, w20, w21
# CHECK-NEXT: 1 5 1.00 smulh x23, x22, xzr
# CHECK-NEXT: 1 5 1.00 umulh x23, x22, xzr
# CHECK-NEXT: 1 4 1.00 mul x19, x20, xzr
# CHECK-NEXT: 1 2 1.00 mneg w21, w22, w23
-# CHECK-NEXT: 1 4 1.00 smull x11, w13, w17
-# CHECK-NEXT: 1 4 1.00 umull x11, w13, w17
-# CHECK-NEXT: 1 4 1.00 smnegl x11, w13, w17
-# CHECK-NEXT: 1 4 1.00 umnegl x11, w13, w17
+# CHECK-NEXT: 1 2 1.00 smull x11, w13, w17
+# CHECK-NEXT: 1 2 1.00 umull x11, w13, w17
+# CHECK-NEXT: 1 2 1.00 smnegl x11, w13, w17
+# CHECK-NEXT: 1 2 1.00 umnegl x11, w13, w17
# CHECK-NEXT: 2 3 1.00 extr w3, w5, w7, #0
# CHECK-NEXT: 2 3 1.00 extr w11, w13, w17, #31
# CHECK-NEXT: 2 3 1.00 extr x3, x5, x7, #15
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index bb70593f1b91d..4bffe97ed1b07 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -31,20 +31,6 @@ mla v0.4s, v1.4s, v2.4s
mla v0.4s, v0.4s, v1.4s
# LLVM-MCA-END
-# LLVM-MCA-BEGIN sqrdmlah
-mul v0.4s, v0.4s, v0.4s
-sqrdmlah v0.4s, v1.4s, v2.4s
-sqrdmlah v0.4s, v1.4s, v2.4s
-sqrdmlah v0.4s, v0.4s, v1.4s
-# LLVM-MCA-END
-
-# LLVM-MCA-BEGIN smlal2
-mul v0.4s, v0.4s, v0.4s
-smlal2 v0.4s, v1.8h, v2.8h
-smlal2 v0.4s, v1.8h, v2.8h
-smlal2 v0.4s, v0.8h, v1.8h
-# LLVM-MCA-END
-
# LLVM-MCA-BEGIN sadalp
mul v0.4s, v0.4s, v0.4s
sadalp v0.2d, v1.4s
@@ -91,6 +77,13 @@ smulh x0, x0, x1
smulh x0, x0, x0
# LLVM-MCA-END
+# LLVM-MCA-BEGIN smaddl
+mul x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
# CHECK: [0] Code Region - madd
# CHECK: Iterations: 100
@@ -249,83 +242,7 @@ smulh x0, x0, x0
# CHECK-NEXT: 3. 2 19.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
-# CHECK: [4] Code Region - sqrdmlah
-
-# CHECK: Iterations: 100
-# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1803
-# CHECK-NEXT: Total uOps: 500
-
-# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.28
-# CHECK-NEXT: IPC: 0.22
-# CHECK-NEXT: Block RThroughput: 8.0
-
-# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 012345678
-# CHECK-NEXT: Index 0123456789 0123456789
-
-# CHECK: [0,0] DeeeeeER . . . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2] .D=======eeeeeER . . . . . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3] .D============eeeeeER . . . . sqrdmlah v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0] . D================eeeeeER . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] . D=====================eeeeeER . . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] . D=======================eeeeeER . . sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] . D============================eeeeeER sqrdmlah v0.4s, v0.4s, v1.4s
-
-# CHECK: Average Wait times (based on the timeline view):
-# CHECK-NEXT: [0]: Executions
-# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 9.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 14.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 16.0 0.0 0.0 sqrdmlah v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 21.0 0.0 0.0 sqrdmlah v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 15.0 0.1 0.0 <total>
-
-# CHECK: [5] Code Region - smlal2
-
-# CHECK: Iterations: 100
-# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 1403
-# CHECK-NEXT: Total uOps: 500
-
-# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.36
-# CHECK-NEXT: IPC: 0.29
-# CHECK-NEXT: Block RThroughput: 5.0
-
-# CHECK: Timeline view:
-# CHECK-NEXT: 0123456789 0
-# CHECK-NEXT: Index 0123456789 0123456789
-
-# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2] .D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3] .D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] . D=================eeeeER . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2] . D=================eeeeER . smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3] . D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h
-
-# CHECK: Average Wait times (based on the timeline view):
-# CHECK-NEXT: [0]: Executions
-# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
-
-# CHECK: [6] Code Region - sadalp
+# CHECK: [4] Code Region - sadalp
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -363,7 +280,7 @@ smulh x0, x0, x0
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
-# CHECK: [7] Code Region - ssra
+# CHECK: [5] Code Region - ssra
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -401,7 +318,7 @@ smulh x0, x0, x0
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1
# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
-# CHECK: [8] Code Region - fmla
+# CHECK: [6] Code Region - fmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
@@ -445,34 +362,34 @@ smulh x0, x0, x0
# CHECK-NEXT: 5. 2 20.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
# CHECK-NEXT: 2 13.5 0.1 0.0 <total>
-# CHECK: [9] Code Region - fmlal
+# CHECK: [7] Code Region - fmlal
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
-# CHECK-NEXT: Total Cycles: 2303
+# CHECK-NEXT: Total Cycles: 2203
# CHECK-NEXT: Total uOps: 600
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.26
-# CHECK-NEXT: IPC: 0.26
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
# CHECK-NEXT: Block RThroughput: 3.0
# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789
-# CHECK-NEXT: Index 0123456789 0123456789 012345678
-
-# CHECK: [0,0] DeeeER . . . . . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,2] D======eeeeeER . . . . . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,3] .D==========eeER . . . . . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,4] .D============eeeeeER . . . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,5] .D=================eeeeeER . . . . . fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0] . D=====================eeeER . . . . . fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1] . D========================eeeeeER . . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2] . D===========================eeeeeER . . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,3] . D===============================eeER . . . fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,4] . D=================================eeeeeER . . fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5] . D======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: Index 0123456789 0123456789 0123456
+
+# CHECK: [0,0] DeeeER . . . . . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,3] .D=========eeER. . . . . . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4] .D===========eeeeeER. . . . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5] .D================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] . D====================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] . D=======================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] . D=========================eeeeeER. . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,3] . D=============================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4] . D===============================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] . D====================================eeeeeER fmlal v0.4s, v0.4h, v1.4h
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -481,15 +398,15 @@ smulh x0, x0, x0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 11.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1. 2 14.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2. 2 17.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 3. 2 21.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 4. 2 23.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5. 2 28.5 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: 2 19.5 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 11.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 14.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 16.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 3. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 27.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 18.3 0.1 0.0 <total>
-# CHECK: [10] Code Region - crc32
+# CHECK: [8] Code Region - crc32
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -527,7 +444,7 @@ smulh x0, x0, x0
# CHECK-NEXT: 3. 2 8.0 0.0 0.0 crc32cb w0, w0, w0
# CHECK-NEXT: 2 6.1 0.1 0.0 <total>
-# CHECK: [11] Code Region - smulh
+# CHECK: [9] Code Region - smulh
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -564,3 +481,41 @@ smulh x0, x0, x0
# CHECK-NEXT: 2. 2 18.5 0.0 0.0 smulh x0, x0, x1
# CHECK-NEXT: 3. 2 23.0 0.0 0.0 smulh x0, x0, x0
# CHECK-NEXT: 2 16.4 0.1 0.0 <total>
+
+# CHECK: [10] Code Region - smaddl
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D===eeER . . . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,2] D====eeER . . . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,3] .D=====eeER . . smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,0] .D=======eeeeER. . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D==========eeER . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,2] . D==========eeER . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] . D============eeER smaddl x0, w0, w0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 2. 2 8.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 7.4 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index 1c03d4be8add2..78c55729d3f7c 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -938,10 +938,10 @@
# CHECK-NEXT: 1 2 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1]
# CHECK-NEXT: 1 4 1.00 sqrdmlah h0, h1, h2
# CHECK-NEXT: 1 4 1.00 sqrdmlah v0.4h, v1.4h, v2.4h
-# CHECK-NEXT: 1 5 2.00 sqrdmlah v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2 5 2.00 sqrdmlah v0.8h, v1.8h, v2.8h
# CHECK-NEXT: 1 4 1.00 sqrdmlah s0, s1, s2
# CHECK-NEXT: 1 4 1.00 sqrdmlah v0.2s, v1.2s, v2.2s
-# CHECK-NEXT: 1 5 2.00 sqrdmlah v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2 5 2.00 sqrdmlah v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 1 2 0.50 sqrdmlsh h0, h1, v2.h[3]
# CHECK-NEXT: 1 2 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3]
# CHECK-NEXT: 1 2 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3]
@@ -950,10 +950,10 @@
# CHECK-NEXT: 1 2 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1]
# CHECK-NEXT: 1 4 1.00 sqrdmlsh h0, h1, h2
# CHECK-NEXT: 1 4 1.00 sqrdmlsh v0.4h, v1.4h, v2.4h
-# CHECK-NEXT: 1 5 2.00 sqrdmlsh v0.8h, v1.8h, v2.8h
+# CHECK-NEXT: 2 5 2.00 sqrdmlsh v0.8h, v1.8h, v2.8h
# CHECK-NEXT: 1 4 1.00 sqrdmlsh s0, s1, s2
# CHECK-NEXT: 1 4 1.00 sqrdmlsh v0.2s, v1.2s, v2.2s
-# CHECK-NEXT: 1 5 2.00 sqrdmlsh v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2 5 2.00 sqrdmlsh v0.4s, v1.4s, v2.4s
# CHECK-NEXT: 1 4 1.00 sqrdmulh h10, h11, h12
# CHECK-NEXT: 1 2 0.50 sqrdmulh h7, h15, v0.h[3]
# CHECK-NEXT: 1 2 0.50 sqrdmulh s15, s14, v0.s[1]
>From 147cf9088c3fdf02187fcc380cd4034d90cb2f2e Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Wed, 25 Feb 2026 19:52:24 +0000
Subject: [PATCH 6/8] Update test
---
.../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 36 +++++++++----------
1 file changed, 18 insertions(+), 18 deletions(-)
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index 4bffe97ed1b07..ad40aff3f30ac 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -88,26 +88,26 @@ smaddl x0, w0, w0, x0
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 407
+# CHECK-NEXT: Total Cycles: 1403
# CHECK-NEXT: Total uOps: 400
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.98
-# CHECK-NEXT: IPC: 0.98
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 4.0
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
-# CHECK: [0,0] DeeeeER . . mul x0, x0, x0
-# CHECK-NEXT: [0,1] D==eeeeER . . madd x0, x1, x2, x0
-# CHECK-NEXT: [0,2] D=eeeeE-R . . madd x0, x1, x2, x0
-# CHECK-NEXT: [0,3] .D====eeeeER . madd x0, x0, x0, x0
-# CHECK-NEXT: [1,0] .D========eeeeER mul x0, x0, x0
-# CHECK-NEXT: [1,1] .D===eeeeE-----R madd x0, x1, x2, x0
-# CHECK-NEXT: [1,2] . D=eeeeE------R madd x0, x1, x2, x0
-# CHECK-NEXT: [1,3] . D=====eeeeE--R madd x0, x0, x0, x0
+# CHECK: [0,0] DeeeeER . . . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,2] D======eeeeER . . . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . madd x0, x0, x0, x0
+# CHECK-NEXT: [1,0] .D=============eeeeER . . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D================eeeeER . . madd x0, x1, x2, x0
+# CHECK-NEXT: [1,2] . D==================eeeeER . madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] . D======================eeeeER madd x0, x0, x0, x0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -116,11 +116,11 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 5.0 0.5 0.0 mul x0, x0, x0
-# CHECK-NEXT: 1. 2 3.5 3.5 2.5 madd x0, x1, x2, x0
-# CHECK-NEXT: 2. 2 2.0 2.0 3.5 madd x0, x1, x2, x0
-# CHECK-NEXT: 3. 2 5.5 0.0 1.0 madd x0, x0, x0, x0
-# CHECK-NEXT: 2 4.0 1.5 1.8 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 10.5 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 16.5 0.0 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 11.9 0.1 0.0 <total>
# CHECK: [1] Code Region - fmadd
>From 964021f7aa3df651924f518835f6bc69ae4493d9 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Fri, 27 Feb 2026 10:59:31 +0000
Subject: [PATCH 7/8] Responds to comments and some general fix ups
---
.../Target/AArch64/AArch64SchedNeoverseN1.td | 21 ++--
.../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 106 +++++++++---------
.../AArch64/Neoverse/N1-neon-instructions.s | 2 +-
3 files changed, 67 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 99e0ef1d8c261..046c0abeecf04 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -274,10 +274,13 @@ def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
-def N1Wr_VMA : SchedWriteRes<[N1UnitV0]> { let Latency = 5; }
+def N1Wr_VMA : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
def N1Rd_VMA : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
+def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
+def N1Rd_VMAL : SchedReadAdvance<3, [N1Wr_VMAL]>;
+
def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
@@ -365,7 +368,7 @@ def : SchedAlias<WriteID64, N1Write_20c5_1M>;
// Multiply accumulate, W-form
// Multiply accumulate, X-form
-// Multiply accumulate, long
+// Multiply accumulate long
def : SchedAlias<WriteIM32, N1Write_2c_1M>;
def : SchedAlias<WriteIM64, N1Write_4c3_1M>;
@@ -654,8 +657,10 @@ def : InstRW<[N1Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
// ASIMD multiply, D-form
def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
- "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
- "^([SU]|SQD)MULLv")>;
+ "^SQ(R)?DMULH(v[14]i16|v[12]i32)$")>;
+
+// ASIMD multiply long
+def : InstRW<[N1Write_4c_1V0], (instregex "^([SU]|SQD)MULLv")>;
// ASIMD multiply accumulate, D-form
def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
@@ -663,11 +668,11 @@ def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
// ASIMD multiply accumulate high, D-form
def : InstRW<[N1Write_4c_1V0], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
-// ASIMD multiply accumulate saturating long
-// ASIMD multiply long
// ASIMD multiply accumulate long
-def : InstRW<[N1Write_4c_1V0], (instregex "^[SU]ML[AS]Lv",
- "^SQDML[AS]Lv")>;
+def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[N1Write_4c_1V0], (instregex "^SQDML[AS]Lv")>;
// ASIMD multiply, Q-form
def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index ad40aff3f30ac..c5d02ec165d05 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -17,6 +17,13 @@ fmadd d0, d1, d2, d0
fmadd d0, d0, d1, d2
# LLVM-MCA-END
+# LLVM-MCA-BEGIN smaddl
+mul x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
# LLVM-MCA-BEGIN saba
mul v0.4s, v0.4s, v0.4s
saba v0.4s, v1.4s, v2.4s
@@ -77,13 +84,6 @@ smulh x0, x0, x1
smulh x0, x0, x0
# LLVM-MCA-END
-# LLVM-MCA-BEGIN smaddl
-mul x0, x0, x0
-smaddl x0, w1, w2, x0
-smaddl x0, w1, w2, x0
-smaddl x0, w0, w0, x0
-# LLVM-MCA-END
-
# CHECK: [0] Code Region - madd
# CHECK: Iterations: 100
@@ -166,7 +166,45 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 5. 2 23.5 0.0 0.0 fmadd d0, d0, d1, d2
# CHECK-NEXT: 2 15.7 0.1 0.0 <total>
-# CHECK: [2] Code Region - saba
+# CHECK: [2] Code Region - smaddl
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D===eeER . . . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,2] D====eeER . . . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,3] .D=====eeER . . smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,0] .D=======eeeeER. . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D==========eeER . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,2] . D==========eeER . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] . D============eeER smaddl x0, w0, w0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 2. 2 8.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 7.4 0.1 0.0 <total>
+
+# CHECK: [3] Code Region - saba
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -204,7 +242,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
-# CHECK: [3] Code Region - mla
+# CHECK: [4] Code Region - mla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -242,7 +280,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 3. 2 19.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
-# CHECK: [4] Code Region - sadalp
+# CHECK: [5] Code Region - sadalp
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -280,7 +318,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
-# CHECK: [5] Code Region - ssra
+# CHECK: [6] Code Region - ssra
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -318,7 +356,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1
# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
-# CHECK: [6] Code Region - fmla
+# CHECK: [7] Code Region - fmla
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
@@ -362,7 +400,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 5. 2 20.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
# CHECK-NEXT: 2 13.5 0.1 0.0 <total>
-# CHECK: [7] Code Region - fmlal
+# CHECK: [8] Code Region - fmlal
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 600
@@ -406,7 +444,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 5. 2 27.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
# CHECK-NEXT: 2 18.3 0.1 0.0 <total>
-# CHECK: [8] Code Region - crc32
+# CHECK: [9] Code Region - crc32
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -444,7 +482,7 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 3. 2 8.0 0.0 0.0 crc32cb w0, w0, w0
# CHECK-NEXT: 2 6.1 0.1 0.0 <total>
-# CHECK: [9] Code Region - smulh
+# CHECK: [10] Code Region - smulh
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
@@ -481,41 +519,3 @@ smaddl x0, w0, w0, x0
# CHECK-NEXT: 2. 2 18.5 0.0 0.0 smulh x0, x0, x1
# CHECK-NEXT: 3. 2 23.0 0.0 0.0 smulh x0, x0, x0
# CHECK-NEXT: 2 16.4 0.1 0.0 <total>
-
-# CHECK: [10] Code Region - smaddl
-
-# CHECK: Iterations: 100
-# CHECK-NEXT: Instructions: 400
-# CHECK-NEXT: Total Cycles: 803
-# CHECK-NEXT: Total uOps: 400
-
-# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.50
-# CHECK-NEXT: IPC: 0.50
-# CHECK-NEXT: Block RThroughput: 4.0
-
-# CHECK: Timeline view:
-# CHECK-NEXT: 012345678
-# CHECK-NEXT: Index 0123456789
-
-# CHECK: [0,0] DeeeeER . . . mul x0, x0, x0
-# CHECK-NEXT: [0,1] D===eeER . . . smaddl x0, w1, w2, x0
-# CHECK-NEXT: [0,2] D====eeER . . . smaddl x0, w1, w2, x0
-# CHECK-NEXT: [0,3] .D=====eeER . . smaddl x0, w0, w0, x0
-# CHECK-NEXT: [1,0] .D=======eeeeER. . mul x0, x0, x0
-# CHECK-NEXT: [1,1] .D==========eeER . smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,2] . D==========eeER . smaddl x0, w1, w2, x0
-# CHECK-NEXT: [1,3] . D============eeER smaddl x0, w0, w0, x0
-
-# CHECK: Average Wait times (based on the timeline view):
-# CHECK-NEXT: [0]: Executions
-# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
-# CHECK-NEXT: 1. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 2. 2 8.0 0.0 0.0 smaddl x0, w1, w2, x0
-# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
-# CHECK-NEXT: 2 7.4 0.1 0.0 <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index 78c55729d3f7c..a4910045eca97 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -613,7 +613,7 @@
# CHECK-NEXT: 1 2 0.50 mla v0.8b, v0.8b, v0.8b
# CHECK-NEXT: 1 2 0.50 mla v15.8h, v22.8h, v4.h[3]
# CHECK-NEXT: 1 2 0.50 mla v28.2s, v10.2s, v2.s[0]
-# CHECK-NEXT: 1 5 1.00 mls v0.4h, v0.4h, v0.4h
+# CHECK-NEXT: 1 4 1.00 mls v0.4h, v0.4h, v0.4h
# CHECK-NEXT: 1 2 0.50 mls v25.8h, v29.8h, v0.h[4]
# CHECK-NEXT: 1 2 0.50 mls v22.2s, v29.2s, v0.s[3]
# CHECK-NEXT: 1 5 2.00 mls v26.4s, v5.4s, v28.4s
>From ffafcb8c4131d9fd89c0c8ba4a483a86e31083d5 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Mon, 2 Mar 2026 15:48:37 +0000
Subject: [PATCH 8/8] Update MLA/MLS Q-form to take two uops
---
.../Target/AArch64/AArch64SchedNeoverseN1.td | 5 +++-
.../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 28 +++++++++----------
.../AArch64/Neoverse/N1-neon-instructions.s | 2 +-
3 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 046c0abeecf04..6134461aa609d 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -275,7 +275,10 @@ def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
def N1Wr_VMA : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
-def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
+def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
def N1Rd_VMA : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index c5d02ec165d05..3080a04872bf9 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -247,10 +247,10 @@ smulh x0, x0, x0
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
# CHECK-NEXT: Total Cycles: 1703
-# CHECK-NEXT: Total uOps: 500
+# CHECK-NEXT: Total uOps: 800
# CHECK: Dispatch Width: 3
-# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: uOps Per Cycle: 0.47
# CHECK-NEXT: IPC: 0.23
# CHECK-NEXT: Block RThroughput: 8.0
@@ -259,13 +259,13 @@ smulh x0, x0, x0
# CHECK-NEXT: Index 0123456789 0123456789
# CHECK: [0,0] DeeeeeER . . . . . .. mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2] .D======eeeeeER. . . . .. mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3] .D===========eeeeeER. . . .. mla v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0] . D===============eeeeeER. . .. mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1] . D====================eeeeeER. .. mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2] . D=====================eeeeeER .. mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3] . D==========================eeeeeER mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [0,1] .D====eeeeeER . . . . .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] . D=====eeeeeER. . . . .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] . D=========eeeeeER. . . .. mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] . D=============eeeeeER. . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeeER. .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . .D==================eeeeeER .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . . D======================eeeeeER mla v0.4s, v0.4s, v1.4s
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@@ -274,11 +274,11 @@ smulh x0, x0, x0
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 2 8.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1. 2 13.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2. 2 14.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3. 2 19.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: 2 14.0 0.1 0.0 <total>
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
# CHECK: [5] Code Region - sadalp
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index a4910045eca97..270990154f245 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -616,7 +616,7 @@
# CHECK-NEXT: 1 4 1.00 mls v0.4h, v0.4h, v0.4h
# CHECK-NEXT: 1 2 0.50 mls v25.8h, v29.8h, v0.h[4]
# CHECK-NEXT: 1 2 0.50 mls v22.2s, v29.2s, v0.s[3]
-# CHECK-NEXT: 1 5 2.00 mls v26.4s, v5.4s, v28.4s
+# CHECK-NEXT: 2 5 2.00 mls v26.4s, v5.4s, v28.4s
# CHECK-NEXT: 1 2 0.50 mov b0, v0.b[15]
# CHECK-NEXT: 1 2 0.50 mov d6, v0.d[1]
# CHECK-NEXT: 1 2 0.50 mov h2, v0.h[5]
More information about the llvm-commits mailing list