[llvm] [AArch64] Model late forwarding in Neoverse N1 (PR #177590)

Amina Chabane via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 2 07:50:23 PST 2026


https://github.com/Amichaxx updated https://github.com/llvm/llvm-project/pull/177590

From 2dfbf7cd341433f444e5dec5cf69d133dbb5f04a Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 22 Jan 2026 16:36:40 +0000
Subject: [PATCH 1/8] [AArch64] Model late forwarding in Neoverse N1

---
 .../Target/AArch64/AArch64SchedNeoverseN1.td  |  99 ++++--
 .../AArch64/Neoverse/N1-basic-instructions.s  |  90 +++---
 .../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 287 ++++++++++++++++++
 3 files changed, 410 insertions(+), 66 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 80e5bff5abba7..d6de36c6081e4 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -261,6 +261,49 @@ def N1Write_9c_6L_6V : SchedWriteRes<[N1UnitL, N1UnitL, N1UnitL,
                                       N1UnitV, N1UnitV, N1UnitV,
                                       N1UnitV, N1UnitV, N1UnitV]>;
 
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+def N1Wr_IM32 : WriteSequence<[N1Write_2c_1M]>;
+def N1Wr_IM64 : WriteSequence<[N1Write_4c3_1M]>;
+def N1Rd_IMA  : SchedReadAdvance<1, [N1Wr_IM32, N1Wr_IM64]>;
+
+def N1Wr_FMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
+
+def N1Wr_VA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
+
+def N1Wr_VMA  : WriteSequence<[N1Write_4c_1V0]>;
+def N1Wr_VMAQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Rd_VMA  : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
+
+def N1Wr_VMAH  : WriteSequence<[N1Write_4c_1V0]>;
+def N1Wr_VMAHQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Rd_VMAH  : SchedReadAdvance<2, [N1Wr_VMAH, N1Wr_VMAHQ]>;
+
+def N1Wr_VMAL  : WriteSequence<[N1Write_4c_1V0]>;
+def N1Rd_VMAL  : SchedReadAdvance<3, [N1Wr_VMAL]>;
+
+def N1Wr_VPA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
+
+def N1Wr_VSA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Rd_VSA : SchedReadAdvance<3, [N1Wr_VSA]>;
+
+def N1Wr_VFM  : WriteSequence<[N1Write_3c_1V]>;
+def N1Wr_VFMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
+
+def N1Wr_VFMAL : WriteSequence<[N1Write_5c_1V]>;
+def N1Rd_VFMAL : SchedReadAdvance<2, [N1Wr_VFMAL]>;
+
+def N1Wr_CRC : WriteSequence<[N1Write_2c_1M]>;
+def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
+
+def N1Wr_MH : WriteSequence<[N1Write_5c3_1M]>;
+def N1Rd_MH : SchedReadAdvance<2, [N1Wr_MH]>;
+
 
 // Miscellaneous Instructions
 // -----------------------------------------------------------------------------
@@ -327,13 +370,21 @@ def : InstRW<[N1Write_2c_1M], (instregex "^(AND|BIC)S[WX]rs$")>;
 def : SchedAlias<WriteID32, N1Write_12c5_1M>;
 def : SchedAlias<WriteID64, N1Write_20c5_1M>;
 
-// Multiply accumulate
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
 // Multiply accumulate, long
 def : SchedAlias<WriteIM32, N1Write_2c_1M>;
 def : SchedAlias<WriteIM64, N1Write_4c3_1M>;
 
+def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
+             (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+             (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+             (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
 // Multiply high
-def : InstRW<[N1Write_5c3_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
+def : InstRW<[N1Wr_MH, ReadIM, ReadIM, N1Rd_MH], (instrs SMULHrr, UMULHrr)>;
 
 
 // Miscellaneous data-processing instructions
@@ -431,7 +482,7 @@ def : InstRW<[N1Write_17c7_1V0], (instrs FSQRTDr)>;
 def : SchedAlias<WriteFMul, N1Write_3c_1V>;
 
 // FP multiply accumulate
-def : InstRW<[N1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[N1Wr_FMA, ReadDefault, ReadDefault, N1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
 
 // FP round to integral
 def : InstRW<[N1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -588,7 +639,7 @@ def : SchedAlias<WriteVq, N1Write_2c_1V>;
 
 // ASIMD absolute diff accum
 // ASIMD absolute diff accum long
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[N1Wr_VA, N1Rd_VA], (instregex "^[SU]ABAL?v")>;
 
 // ASIMD arith, reduce, 4H/4S
 def : InstRW<[N1Write_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
@@ -609,26 +660,32 @@ def : InstRW<[N1Write_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8(i8|i16)v$")>;
 def : InstRW<[N1Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
 
 // ASIMD multiply, D-form
+def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
+                                          "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
+                                          "^([SU]|SQD)MULLv")>;
+
 // ASIMD multiply accumulate, D-form
+def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
+
 // ASIMD multiply accumulate high, D-form
+def : InstRW<[N1Wr_VMAH, N1Rd_VMAH], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
+
 // ASIMD multiply accumulate saturating long
 // ASIMD multiply long
 // ASIMD multiply accumulate long
-def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
-                                          "^ML[AS](v[14]i16|v[12]i32)$",
-                                          "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
-                                          "^SQRDML[AS]H(v[14]i16|v[12]i32)$",
-                                          "^SQDML[AS]Lv",
-                                          "^([SU]|SQD)MULLv",
-                                          "^[SU]ML[AS]Lv")>;
+def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv",
+                                                 "^SQDML[AS]Lv")>;
 
 // ASIMD multiply, Q-form
+def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
+                                          "^SQ(R)?DMULH(v8i16|v4i32)$")>;
+
 // ASIMD multiply accumulate, Q-form
+def : InstRW<[N1Wr_VMAQ, N1Rd_VMA], (instregex "^ML[AS](v8i16|v4i32)$")>;
+
 // ASIMD multiply accumulate high, Q-form
-def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
-                                          "^ML[AS](v8i16|v4i32)$",
-                                          "^SQ(R)?DMULH(v8i16|v4i32)$",
-                                          "^SQRDML[AS]H(v8i16|v4i32)$")>;
+def : InstRW<[N1Wr_VMAHQ, N1Rd_VMAH],
+             (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
 
 // ASIMD multiply/multiply long (8x8) polynomial, D-form
 def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
@@ -637,10 +694,10 @@ def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
 def : InstRW<[N1Write_4c_2V0], (instrs PMULv16i8, PMULLv16i8)>;
 
 // ASIMD pairwise add and accumulate long
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]ADALPv")>;
+def : InstRW<[N1Wr_VPA, N1Rd_VPA], (instregex "^[SU]ADALPv")>;
 
 // ASIMD shift accumulate
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]R?SRAv")>;
+def : InstRW<[N1Wr_VSA, N1Rd_VSA], (instregex "^[SU]R?SRAv")>;
 
 // ASIMD shift by immed, basic
 // ASIMD shift by immed and insert, basic
@@ -722,13 +779,13 @@ def : InstRW<[N1Write_5c_1V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
 def : InstRW<[N1Write_8c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
 
 // ASIMD FP multiply
-def : InstRW<[N1Write_3c_1V], (instregex "^FMULX?v")>;
+def : InstRW<[N1Wr_VFM], (instregex "^FMULX?v")>;
 
 // ASIMD FP multiply accumulate
-def : InstRW<[N1Write_4c_1V], (instregex "^FML[AS]v")>;
+def : InstRW<[N1Wr_VFMA, N1Rd_VFMA], (instregex "^FML[AS]v")>;
 
 // ASIMD FP multiply accumulate long
-def : InstRW<[N1Write_5c_1V], (instregex "^FML[AS]L2?v")>;
+def : InstRW<[N1Wr_VFMAL, N1Rd_VFMAL], (instregex "^FML[AS]L2?v")>;
 
 // ASIMD FP round, D-form F32 and Q-form F64
 def : InstRW<[N1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
@@ -1053,7 +1110,7 @@ def : InstRW<[N1Write_4c_1V0], (instregex "^SHA1[CMP]rrr$",
 // -----------------------------------------------------------------------------
 
 // CRC checksum ops
-def : InstRW<[N1Write_2c_1M], (instregex "^CRC32C?[BHWX]rr$")>;
+def : InstRW<[N1Wr_CRC, N1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
 
 
 }
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index 0a36c14e43955..fb3f2ccc92441 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -472,32 +472,32 @@
 # CHECK-NEXT:  1      4     3.00                        msub	x13, xzr, x4, x4
 # CHECK-NEXT:  1      4     3.00                        msub	x19, x30, xzr, x29
 # CHECK-NEXT:  1      4     3.00                        mneg	x4, x5, x6
-# CHECK-NEXT:  1      2     1.00                        smaddl	x3, w5, w2, x9
-# CHECK-NEXT:  1      2     1.00                        smaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      2     1.00                        smaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      2     1.00                        smaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      2     1.00                        smull	x19, w20, w21
-# CHECK-NEXT:  1      2     1.00                        smsubl	x3, w5, w2, x9
-# CHECK-NEXT:  1      2     1.00                        smsubl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      2     1.00                        smsubl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      2     1.00                        smsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      2     1.00                        smnegl	x19, w20, w21
-# CHECK-NEXT:  1      2     1.00                        umaddl	x3, w5, w2, x9
-# CHECK-NEXT:  1      2     1.00                        umaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      2     1.00                        umaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      2     1.00                        umaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      2     1.00                        umull	x19, w20, w21
-# CHECK-NEXT:  1      2     1.00                        umsubl	x3, w5, w2, x9
-# CHECK-NEXT:  1      2     1.00                        umsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      2     1.00                        umnegl	x19, w20, w21
+# CHECK-NEXT:  1      4     3.00                        smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     3.00                        smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      4     3.00                        smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      4     3.00                        smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     3.00                        smull	x19, w20, w21
+# CHECK-NEXT:  1      4     3.00                        smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     3.00                        smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      4     3.00                        smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      4     3.00                        smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     3.00                        smnegl	x19, w20, w21
+# CHECK-NEXT:  1      4     3.00                        umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     3.00                        umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      4     3.00                        umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      4     3.00                        umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     3.00                        umull	x19, w20, w21
+# CHECK-NEXT:  1      4     3.00                        umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     3.00                        umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     3.00                        umnegl	x19, w20, w21
 # CHECK-NEXT:  1      5     3.00                        smulh	x23, x22, xzr
 # CHECK-NEXT:  1      5     3.00                        umulh	x23, x22, xzr
 # CHECK-NEXT:  1      4     3.00                        mul	x19, x20, xzr
 # CHECK-NEXT:  1      2     1.00                        mneg	w21, w22, w23
-# CHECK-NEXT:  1      2     1.00                        smull	x11, w13, w17
-# CHECK-NEXT:  1      2     1.00                        umull	x11, w13, w17
-# CHECK-NEXT:  1      2     1.00                        smnegl	x11, w13, w17
-# CHECK-NEXT:  1      2     1.00                        umnegl	x11, w13, w17
+# CHECK-NEXT:  1      4     3.00                        smull	x11, w13, w17
+# CHECK-NEXT:  1      4     3.00                        umull	x11, w13, w17
+# CHECK-NEXT:  1      4     3.00                        smnegl	x11, w13, w17
+# CHECK-NEXT:  1      4     3.00                        umnegl	x11, w13, w17
 # CHECK-NEXT:  2      3     1.00                        extr	w3, w5, w7, #0
 # CHECK-NEXT:  2      3     1.00                        extr	w11, w13, w17, #31
 # CHECK-NEXT:  2      3     1.00                        extr	x3, x5, x7, #15
@@ -1264,7 +1264,7 @@
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2.0]  [2.1]  [3]    [4.0]  [4.1]  [5]    [6]
-# CHECK-NEXT: 26.00  34.00  34.00  252.50 252.50 483.33 197.33 197.33 293.00 161.00
+# CHECK-NEXT: 26.00  34.00  34.00  252.50 252.50 527.33 197.33 197.33 293.00 161.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2.0]  [2.1]  [3]    [4.0]  [4.1]  [5]    [6]    Instructions:
@@ -1730,32 +1730,32 @@
 # CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     msub	x13, xzr, x4, x4
 # CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     msub	x19, x30, xzr, x29
 # CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     mneg	x4, x5, x6
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smull	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	xzr, w10, w11, x12
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	x13, wzr, w14, x15
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smnegl	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umull	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umsubl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umnegl	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smull	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smnegl	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umull	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umnegl	x19, w20, w21
 # CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smulh	x23, x22, xzr
 # CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umulh	x23, x22, xzr
 # CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     mul	x19, x20, xzr
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mneg	w21, w22, w23
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smull	x11, w13, w17
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umull	x11, w13, w17
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smnegl	x11, w13, w17
-# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umnegl	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smull	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umull	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smnegl	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umnegl	x11, w13, w17
 # CHECK-NEXT:  -      -      -      -      -     1.33   0.33   0.33    -      -     extr	w3, w5, w7, #0
 # CHECK-NEXT:  -      -      -      -      -     1.33   0.33   0.33    -      -     extr	w11, w13, w17, #31
 # CHECK-NEXT:  -      -      -      -      -     1.33   0.33   0.33    -      -     extr	x3, x5, x7, #15
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
new file mode 100644
index 0000000000000..656b4daff5aab
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -0,0 +1,287 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n1 -mattr=+fp16fml --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul  x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul  v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32
+mul     w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+# LLVM-MCA-END
+# LLVM-MCA-BEGIN smulh
+mul    x0, x0, x0
+smulh x0, x1, x2
+smulh x0, x1, x2
+smulh x0, x0, x1
+# LLVM-MCA-END
+
+# CHECK:      [0] Code Region - madd
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1204
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01234567
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    . .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D======eeeeER  .    .    . .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D===eeeeE---R  .    .    . .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     .D========eeeeER    .    . .   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,0]     .D==============eeeeER   . .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D=================eeeeER. .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     . D==========eeeeE------R. .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     . D===================eeeeER   madd	x0, x0, x0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.0    1.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     12.5   12.5   0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     7.5    7.5    4.5       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     14.5   3.5    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     10.6   6.3    1.1       <total>
+
+# CHECK:      [1] Code Region - fmadd
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   .   fadd	d0, d0, d0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    .    .   .   fmul	d0, d0, d0
+# CHECK-NEXT: [0,3]     .D========eeeeER    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,4]     .D============eeeeER.    .    .    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,5]     .D================eeeeER .    .    .    .   .   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     . D===================eeER    .    .    .   .   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     . D=====================eeeeER.    .    .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     . D=========================eeeER  .    .   .   fmul	d0, d0, d0
+# CHECK-NEXT: [1,3]     .  D===========================eeeeER   .   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,4]     .  D===============================eeeeER   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     .  D===================================eeeeER   fmadd	d0, d0, d1, d2
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     12.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     16.5   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 4.     2     22.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     26.5   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     17.8   0.1    0.0       <total>
+
+# CHECK:      [2] Code Region - saba
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1703
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     .D========eeeeER    .    .    .    ..   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     .D============eeeeER.    .    .    ..   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     . D===============eeeeeER.    .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     . D====================eeeeER .    ..   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .  D=======================eeeeER  ..   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .  D===========================eeeeER   saba	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     13.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     16.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     20.5   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     14.8   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - fmla
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          01234
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,3]     .D========eeeeER    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,4]     .D============eeeeER.    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5]     .D================eeeeER .    .    .    .   .   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     . D===================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     . D======================eeeeER    .    .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     . D==========================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,3]     .  D===========================eeeeER   .   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,4]     .  D===============================eeeeER   .   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     .  D===================================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 4.     2     22.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     26.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     18.2   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - crc32
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeER.    .    .  .   mul	w0, w0, w0
+# CHECK-NEXT: [0,1]     D==eeER   .    .  .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,2]     D====eeER .    .  .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,3]     .D=====eeER    .  .   crc32cb	w0, w0, w0
+# CHECK-NEXT: [1,0]     .D=======eeER  .  .   mul	w0, w0, w0
+# CHECK-NEXT: [1,1]     .D=========eeER.  .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,2]     . D==========eeER .   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,3]     . D============eeER   crc32cb	w0, w0, w0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	w0, w0, w0
+# CHECK-NEXT: 1.     2     6.5    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 2.     2     8.0    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       crc32cb	w0, w0, w0
+# CHECK-NEXT:        2     7.1    0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - smulh
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1205
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.33
+# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .  .   smulh	x0, x1, x2
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .  .   smulh	x0, x1, x2
+# CHECK-NEXT: [0,3]     .D===========eeeeeER.    .  .   smulh	x0, x0, x1
+# CHECK-NEXT: [1,0]     .D=================eeeeER.  .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D========eeeeeE--------R.  .   smulh	x0, x1, x2
+# CHECK-NEXT: [1,2]     . D=============eeeeeE--R.  .   smulh	x0, x1, x2
+# CHECK-NEXT: [1,3]     . D===================eeeeeER   smulh	x0, x0, x1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.5    1.0    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     6.5    6.5    4.0       smulh	x0, x1, x2
+# CHECK-NEXT: 2.     2     10.5   10.5   1.0       smulh	x0, x1, x2
+# CHECK-NEXT: 3.     2     16.0   1.0    0.0       smulh	x0, x0, x1
+# CHECK-NEXT:        2     10.6   4.8    1.3       <total>

From c7f8dfbd724438b2ac86e1100fd028eb452b7fa3 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 12 Feb 2026 00:11:55 +0000
Subject: [PATCH 2/8] n1-forwarding pre-commit

---
 .../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 527 +++++++++++++-----
 1 file changed, 403 insertions(+), 124 deletions(-)

diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index 656b4daff5aab..bb70593f1b91d 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -11,8 +11,8 @@ madd x0, x0, x0, x0
 # LLVM-MCA-BEGIN fmadd
 fadd  d0, d0, d0
 fmadd d0, d1, d2, d0
-fmul  d0, d0, d0
 fmadd d0, d1, d2, d0
+fmul  d0, d0, d0
 fmadd d0, d1, d2, d0
 fmadd d0, d0, d1, d2
 # LLVM-MCA-END
@@ -24,52 +24,97 @@ saba v0.4s, v1.4s, v2.4s
 saba v0.4s, v0.4s, v1.4s
 # LLVM-MCA-END
 
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqrdmlah
+mul      v0.4s, v0.4s, v0.4s
+sqrdmlah v0.4s, v1.4s, v2.4s
+sqrdmlah v0.4s, v1.4s, v2.4s
+sqrdmlah v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul    v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h, v1.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul    v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ssra
+mul  v0.4s, v0.4s, v0.4s
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v0.2d, #1
+# LLVM-MCA-END
+
 # LLVM-MCA-BEGIN fmla
 fmul v0.2d, v0.2d, v0.2d
 fmla v0.2d, v1.2d, v2.2d
-fadd v0.2d, v0.2d, v0.2d
 fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
 fmla v0.2d, v1.2d, v2.2d
 fmla v0.2d, v0.2d, v1.2d
 # LLVM-MCA-END
 
+# LLVM-MCA-BEGIN fmlal
+fmul  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fadd  v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
 # LLVM-MCA-BEGIN crc32
 mul     w0, w0, w0
 crc32cb w0, w0, w1
 crc32cb w0, w0, w1
 crc32cb w0, w0, w0
 # LLVM-MCA-END
+
 # LLVM-MCA-BEGIN smulh
-mul    x0, x0, x0
-smulh x0, x1, x2
-smulh x0, x1, x2
+mul   x0, x0, x0
 smulh x0, x0, x1
+smulh x0, x0, x1
+smulh x0, x0, x0
 # LLVM-MCA-END
 
 # CHECK:      [0] Code Region - madd
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1204
+# CHECK-NEXT: Total Cycles:      407
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.33
-# CHECK-NEXT: Block RThroughput: 12.0
+# CHECK-NEXT: uOps Per Cycle:    0.98
+# CHECK-NEXT: IPC:               0.98
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234567
-
-# CHECK:      [0,0]     DeeeeER   .    .    .    . .   mul	x0, x0, x0
-# CHECK-NEXT: [0,1]     D======eeeeER  .    .    . .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,2]     D===eeeeE---R  .    .    . .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,3]     .D========eeeeER    .    . .   madd	x0, x0, x0, x0
-# CHECK-NEXT: [1,0]     .D==============eeeeER   . .   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     .D=================eeeeER. .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,2]     . D==========eeeeE------R. .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,3]     . D===================eeeeER   madd	x0, x0, x0, x0
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D=eeeeE-R .    .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     .D====eeeeER   .   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,0]     .D========eeeeER   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D===eeeeE-----R   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     . D=eeeeE------R   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     . D=====eeeeE--R   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -78,40 +123,40 @@ smulh x0, x0, x1
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.0    1.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     12.5   12.5   0.0       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     2     7.5    7.5    4.5       madd	x0, x1, x2, x0
-# CHECK-NEXT: 3.     2     14.5   3.5    0.0       madd	x0, x0, x0, x0
-# CHECK-NEXT:        2     10.6   6.3    1.1       <total>
+# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     3.5    3.5    2.5       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     2.0    2.0    3.5       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     5.5    0.0    1.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     4.0    1.5    1.8       <total>
 
 # CHECK:      [1] Code Region - fmadd
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.32
+# CHECK-NEXT: IPC:               0.32
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   .   fadd	d0, d0, d0
-# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,2]     D======eeeER   .    .    .    .    .    .   .   fmul	d0, d0, d0
-# CHECK-NEXT: [0,3]     .D========eeeeER    .    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,4]     .D============eeeeER.    .    .    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [0,5]     .D================eeeeER .    .    .    .   .   fmadd	d0, d0, d1, d2
-# CHECK-NEXT: [1,0]     . D===================eeER    .    .    .   .   fadd	d0, d0, d0
-# CHECK-NEXT: [1,1]     . D=====================eeeeER.    .    .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,2]     . D=========================eeeER  .    .   .   fmul	d0, d0, d0
-# CHECK-NEXT: [1,3]     .  D===========================eeeeER   .   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,4]     .  D===============================eeeeER   .   fmadd	d0, d1, d2, d0
-# CHECK-NEXT: [1,5]     .  D===================================eeeeER   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeER.    .    .    .    .    .    .    .   fadd	d0, d0, d0
+# CHECK-NEXT: [0,1]     D==eeeeER .    .    .    .    .    .    .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,2]     D====eeeeER    .    .    .    .    .    .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,3]     .D=======eeeER .    .    .    .    .    .   fmul	d0, d0, d0
+# CHECK-NEXT: [0,4]     .D==========eeeeER  .    .    .    .    .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [0,5]     .D==============eeeeER   .    .    .    .   fmadd	d0, d0, d1, d2
+# CHECK-NEXT: [1,0]     . D=================eeER .    .    .    .   fadd	d0, d0, d0
+# CHECK-NEXT: [1,1]     . D===================eeeeER  .    .    .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,2]     . D=====================eeeeER.    .    .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,3]     .  D========================eeeER  .    .   fmul	d0, d0, d0
+# CHECK-NEXT: [1,4]     .  D===========================eeeeER   .   fmadd	d0, d1, d2, d0
+# CHECK-NEXT: [1,5]     .  D===============================eeeeER   fmadd	d0, d0, d1, d2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -120,16 +165,54 @@ smulh x0, x0, x1
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fadd	d0, d0, d0
-# CHECK-NEXT: 1.     2     12.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 2.     2     16.5   0.0    0.0       fmul	d0, d0, d0
-# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 4.     2     22.5   0.0    0.0       fmadd	d0, d1, d2, d0
-# CHECK-NEXT: 5.     2     26.5   0.0    0.0       fmadd	d0, d0, d1, d2
-# CHECK-NEXT:        2     17.8   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     9.5    0.5    0.0       fadd	d0, d0, d0
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 2.     2     13.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       fmul	d0, d0, d0
+# CHECK-NEXT: 4.     2     19.5   0.0    0.0       fmadd	d0, d1, d2, d0
+# CHECK-NEXT: 5.     2     23.5   0.0    0.0       fmadd	d0, d0, d1, d2
+# CHECK-NEXT:        2     15.7   0.1    0.0       <total>
 
 # CHECK:      [2] Code Region - saba
 
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .    .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     .D=========eeeeER   .    .    .   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     . D============eeeeeER   .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     . D=================eeeeER    .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .  D=================eeeeER   .   saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .  D=====================eeeeER   saba	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - mla
+
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
 # CHECK-NEXT: Total Cycles:      1703
@@ -138,20 +221,20 @@ smulh x0, x0, x1
 # CHECK:      Dispatch Width:    3
 # CHECK-NEXT: uOps Per Cycle:    0.29
 # CHECK-NEXT: IPC:               0.23
-# CHECK-NEXT: Block RThroughput: 3.0
+# CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456
 # CHECK-NEXT: Index     0123456789          0123456789
 
 # CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .    ..   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     .D========eeeeER    .    .    .    ..   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     .D============eeeeER.    .    .    ..   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     .D======eeeeeER.    .    .    .    ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     .D===========eeeeeER.    .    .    ..   mla	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT: [1,0]     . D===============eeeeeER.    .    ..   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     . D====================eeeeER .    ..   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     .  D=======================eeeeER  ..   saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     .  D===========================eeeeER   saba	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,1]     . D====================eeeeeER.    ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .  D=====================eeeeeER   ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .  D==========================eeeeeER   mla	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -161,39 +244,235 @@ smulh x0, x0, x1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     16.5   0.0    0.0       saba	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     20.5   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     14.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     2     13.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     14.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     19.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+
+# CHECK:      [4] Code Region - sqrdmlah
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1803
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.28
+# CHECK-NEXT: IPC:               0.22
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          012345678
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     .D=======eeeeeER    .    .    .    .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     .D============eeeeeER    .    .    .  .   sqrdmlah	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     . D================eeeeeER    .    .  .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     . D=====================eeeeeER    .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .  D=======================eeeeeER .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .  D============================eeeeeER   sqrdmlah	v0.4s, v0.4s, v1.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     14.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sqrdmlah	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
+
+# CHECK:      [5] Code Region - smlal2
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .    .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [0,3]     .D=========eeeeER   .    .    .   smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT: [1,0]     . D============eeeeeER   .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     . D=================eeeeER    .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,2]     .  D=================eeeeER   .   smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: [1,3]     .  D=====================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
 
-# CHECK:      [3] Code Region - fmla
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+
+# CHECK:      [6] Code Region - sadalp
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .    .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [0,3]     .D=========eeeeER   .    .    .   sadalp	v0.2d, v0.4s
+# CHECK-NEXT: [1,0]     . D============eeeeeER   .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     . D=================eeeeER    .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,2]     .  D=================eeeeER   .   sadalp	v0.2d, v1.4s
+# CHECK-NEXT: [1,3]     .  D=====================eeeeER   sadalp	v0.2d, v0.4s
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       sadalp	v0.2d, v1.4s
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       sadalp	v0.2d, v0.4s
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+
+# CHECK:      [7] Code Region - ssra
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      1403
+# CHECK-NEXT: Total uOps:        500
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.36
+# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .    .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,3]     .D=========eeeeER   .    .    .   ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,0]     . D============eeeeeER   .    .   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     . D=================eeeeER    .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,2]     .  D=================eeeeER   .   ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3]     .  D=====================eeeeER   ssra	v0.2d, v0.2d, #1
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     12.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 2.     2     12.0   0.0    0.0       ssra	v0.2d, v1.2d, #1
+# CHECK-NEXT: 3.     2     16.0   0.0    0.0       ssra	v0.2d, v0.2d, #1
+# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
+
+# CHECK:      [8] Code Region - fmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2103
+# CHECK-NEXT: Total Cycles:      1703
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.29
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    0.35
+# CHECK-NEXT: IPC:               0.35
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0123456789          0123456
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D=eeeeER  .    .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2]     D===eeeeER.    .    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,3]     .D======eeER   .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4]     .D========eeeeER    .    .    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5]     .D============eeeeER.    .    .    ..   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0]     . D===============eeeER  .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     . D================eeeeER.    .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2]     . D==================eeeeER   .    ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,3]     .  D=====================eeER .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4]     .  D=======================eeeeER  ..   fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5]     .  D===========================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     8.5    0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     9.5    0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2.     2     11.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 3.     2     14.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4.     2     16.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5.     2     20.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT:        2     13.5   0.1    0.0       <total>
+
+# CHECK:      [9] Code Region - fmlal
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      600
+# CHECK-NEXT: Total Cycles:      2303
+# CHECK-NEXT: Total uOps:        600
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.26
+# CHECK-NEXT: IPC:               0.26
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          01234
-
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,2]     D=======eeER   .    .    .    .    .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,3]     .D========eeeeER    .    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,4]     .D============eeeeER.    .    .    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [0,5]     .D================eeeeER .    .    .    .   .   fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT: [1,0]     . D===================eeeER   .    .    .   .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     . D======================eeeeER    .    .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,2]     . D==========================eeER  .    .   .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,3]     .  D===========================eeeeER   .   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,4]     .  D===============================eeeeER   .   fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: [1,5]     .  D===================================eeeeER   fmla	v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: Index     0123456789          0123456789          012345678
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .    .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,3]     .D==========eeER    .    .    .    .    .    .  .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4]     .D============eeeeeER    .    .    .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5]     .D=================eeeeeER    .    .    .    .  .   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     . D=====================eeeER .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     . D========================eeeeeER .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     . D===========================eeeeeER   .    .  .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,3]     .  D===============================eeER .    .  .   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4]     .  D=================================eeeeeER .  .   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     .  D======================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -202,38 +481,38 @@ smulh x0, x0, x1
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     10.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 3.     2     18.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 4.     2     22.5   0.0    0.0       fmla	v0.2d, v1.2d, v2.2d
-# CHECK-NEXT: 5.     2     26.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
-# CHECK-NEXT:        2     18.2   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 3.     2     21.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4.     2     23.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     28.5   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     19.5   0.1    0.0       <total>
 
-# CHECK:      [4] Code Region - crc32
+# CHECK:      [10] Code Region - crc32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    0.57
+# CHECK-NEXT: IPC:               0.57
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
+# CHECK-NEXT:                     0123456
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .    .  .   mul	w0, w0, w0
-# CHECK-NEXT: [0,1]     D==eeER   .    .  .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [0,2]     D====eeER .    .  .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [0,3]     .D=====eeER    .  .   crc32cb	w0, w0, w0
-# CHECK-NEXT: [1,0]     .D=======eeER  .  .   mul	w0, w0, w0
-# CHECK-NEXT: [1,1]     .D=========eeER.  .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [1,2]     . D==========eeER .   crc32cb	w0, w0, w1
-# CHECK-NEXT: [1,3]     . D============eeER   crc32cb	w0, w0, w0
+# CHECK:      [0,0]     DeeER.    .    ..   mul	w0, w0, w0
+# CHECK-NEXT: [0,1]     D==eeER   .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,2]     D===eeER  .    ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [0,3]     .D====eeER.    ..   crc32cb	w0, w0, w0
+# CHECK-NEXT: [1,0]     .D======eeER   ..   mul	w0, w0, w0
+# CHECK-NEXT: [1,1]     .D========eeER ..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,2]     . D========eeER..   crc32cb	w0, w0, w1
+# CHECK-NEXT: [1,3]     . D==========eeER   crc32cb	w0, w0, w0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -242,36 +521,36 @@ smulh x0, x0, x1
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	w0, w0, w0
-# CHECK-NEXT: 1.     2     6.5    0.0    0.0       crc32cb	w0, w0, w1
-# CHECK-NEXT: 2.     2     8.0    0.0    0.0       crc32cb	w0, w0, w1
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       crc32cb	w0, w0, w0
-# CHECK-NEXT:        2     7.1    0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     4.0    0.5    0.0       mul	w0, w0, w0
+# CHECK-NEXT: 1.     2     6.0    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 2.     2     6.5    0.0    0.0       crc32cb	w0, w0, w1
+# CHECK-NEXT: 3.     2     8.0    0.0    0.0       crc32cb	w0, w0, w0
+# CHECK-NEXT:        2     6.1    0.1    0.0       <total>
 
-# CHECK:      [5] Code Region - smulh
+# CHECK:      [11] Code Region - smulh
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1205
+# CHECK-NEXT: Total Cycles:      1903
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.33
-# CHECK-NEXT: IPC:               0.33
-# CHECK-NEXT: Block RThroughput: 12.0
+# CHECK-NEXT: uOps Per Cycle:    0.21
+# CHECK-NEXT: IPC:               0.21
+# CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          012345678
-
-# CHECK:      [0,0]     DeeeeER   .    .    .    .  .   mul	x0, x0, x0
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .  .   smulh	x0, x1, x2
-# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .  .   smulh	x0, x1, x2
-# CHECK-NEXT: [0,3]     .D===========eeeeeER.    .  .   smulh	x0, x0, x1
-# CHECK-NEXT: [1,0]     .D=================eeeeER.  .   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     .D========eeeeeE--------R.  .   smulh	x0, x1, x2
-# CHECK-NEXT: [1,2]     . D=============eeeeeE--R.  .   smulh	x0, x1, x2
-# CHECK-NEXT: [1,3]     . D===================eeeeeER   smulh	x0, x0, x1
+# CHECK-NEXT:                     0123456789          0123456789
+# CHECK-NEXT: Index     0123456789          0123456789          0
+
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .    .    .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D====eeeeeER   .    .    .    .    .    .   smulh	x0, x0, x1
+# CHECK-NEXT: [0,2]     D=========eeeeeER   .    .    .    .    .   smulh	x0, x0, x1
+# CHECK-NEXT: [0,3]     .D=============eeeeeER   .    .    .    .   smulh	x0, x0, x0
+# CHECK-NEXT: [1,0]     .D==================eeeeER    .    .    .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D======================eeeeeER    .    .   smulh	x0, x0, x1
+# CHECK-NEXT: [1,2]     . D==========================eeeeeER    .   smulh	x0, x0, x1
+# CHECK-NEXT: [1,3]     . D===============================eeeeeER   smulh	x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -280,8 +559,8 @@ smulh x0, x0, x1
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.5    1.0    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     6.5    6.5    4.0       smulh	x0, x1, x2
-# CHECK-NEXT: 2.     2     10.5   10.5   1.0       smulh	x0, x1, x2
-# CHECK-NEXT: 3.     2     16.0   1.0    0.0       smulh	x0, x0, x1
-# CHECK-NEXT:        2     10.6   4.8    1.3       <total>
+# CHECK-NEXT: 0.     2     10.0   0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     14.0   0.0    0.0       smulh	x0, x0, x1
+# CHECK-NEXT: 2.     2     18.5   0.0    0.0       smulh	x0, x0, x1
+# CHECK-NEXT: 3.     2     23.0   0.0    0.0       smulh	x0, x0, x0
+# CHECK-NEXT:        2     16.4   0.1    0.0       <total>

>From 502730ba69cd3f613c34fba4fe3c10b4d1fd9a5e Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 12 Feb 2026 00:21:10 +0000
Subject: [PATCH 3/8] Updated scheduling model + tests

---
 .../Target/AArch64/AArch64SchedNeoverseN1.td  |  37 ++--
 .../AArch64/Neoverse/N1-basic-instructions.s  | 158 +++++++++---------
 .../AArch64/Neoverse/N1-neon-instructions.s   |  12 +-
 3 files changed, 105 insertions(+), 102 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index d6de36c6081e4..b890c01dd0ddb 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -264,44 +264,44 @@ def N1Write_9c_6L_6V : SchedWriteRes<[N1UnitL, N1UnitL, N1UnitL,
 //===----------------------------------------------------------------------===//
 // Define forwarded types
 
-def N1Wr_IM32 : WriteSequence<[N1Write_2c_1M]>;
-def N1Wr_IM64 : WriteSequence<[N1Write_4c3_1M]>;
+def N1Wr_IM32 : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
+def N1Wr_IM64 : SchedWriteRes<[N1UnitM]> { let Latency = 4; }
 def N1Rd_IMA  : SchedReadAdvance<1, [N1Wr_IM32, N1Wr_IM64]>;
 
-def N1Wr_FMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Wr_FMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
 def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
 
-def N1Wr_VA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
 
-def N1Wr_VMA  : WriteSequence<[N1Write_4c_1V0]>;
-def N1Wr_VMAQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Wr_VMA  : SchedWriteRes<[N1UnitV0]> { let Latency = 5; }
+def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
 def N1Rd_VMA  : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
 
-def N1Wr_VMAH  : WriteSequence<[N1Write_4c_1V0]>;
-def N1Wr_VMAHQ : WriteSequence<[N1Write_5c_2V0]>;
+def N1Wr_VMAH  : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
+def N1Wr_VMAHQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
 def N1Rd_VMAH  : SchedReadAdvance<2, [N1Wr_VMAH, N1Wr_VMAHQ]>;
 
-def N1Wr_VMAL  : WriteSequence<[N1Write_4c_1V0]>;
+def N1Wr_VMAL  : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
 def N1Rd_VMAL  : SchedReadAdvance<3, [N1Wr_VMAL]>;
 
-def N1Wr_VPA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
 
-def N1Wr_VSA : WriteSequence<[N1Write_4c_1V1]>;
+def N1Wr_VSA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VSA : SchedReadAdvance<3, [N1Wr_VSA]>;
 
-def N1Wr_VFM  : WriteSequence<[N1Write_3c_1V]>;
-def N1Wr_VFMA : WriteSequence<[N1Write_4c_1V]>;
+def N1Wr_VFM  : SchedWriteRes<[N1UnitV]> { let Latency = 3; }
+def N1Wr_VFMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
 def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
 
-def N1Wr_VFMAL : WriteSequence<[N1Write_5c_1V]>;
+def N1Wr_VFMAL : SchedWriteRes<[N1UnitV]> { let Latency = 5; }
 def N1Rd_VFMAL : SchedReadAdvance<2, [N1Wr_VFMAL]>;
 
-def N1Wr_CRC : WriteSequence<[N1Write_2c_1M]>;
+def N1Wr_CRC : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
 def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
 
-def N1Wr_MH : WriteSequence<[N1Write_5c3_1M]>;
+def N1Wr_MH : SchedWriteRes<[N1UnitM]> { let Latency = 5; }
 def N1Rd_MH : SchedReadAdvance<2, [N1Wr_MH]>;
 
 
@@ -329,6 +329,9 @@ def : SchedAlias<WriteBrReg, N1Write_1c_1B>;
 // Branch and link, register
 def : InstRW<[N1Write_1c_1B_1I], (instrs BL, BLR)>;
 
+// Compare and branch
+def : InstRW<[N1Write_1c_1B], (instregex "^[CT]BN?Z[XW]$")>;
+
 
 // Arithmetic and Logical Instructions
 // -----------------------------------------------------------------------------
@@ -348,7 +351,7 @@ def : SchedAlias<WriteI, N1Write_1c_1I>;
 def : SchedAlias<WriteIEReg, N1Write_2c_1M>;
 
 // Arithmetic, LSL shift, shift <= 4
-// Arithmetic, flagset, LSL shift, shift <= 4
+// Arithmetic, flagset, LSL shift, shift <= 4ah
 // Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
 def N1WriteISReg : SchedWriteVariant<[
                      SchedVar<IsCheapLSL,  [N1Write_1c_1I]>,
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index fb3f2ccc92441..5b5d29623f52f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -448,56 +448,56 @@
 # CHECK-NEXT:  1      2     1.00                        crc32ch	w25, w26, w16
 # CHECK-NEXT:  1      2     1.00                        crc32cw	w27, w12, w23
 # CHECK-NEXT:  1      2     1.00                        crc32cx	w21, w28, x5
-# CHECK-NEXT:  1      5     3.00                        smulh	x30, x29, x28
-# CHECK-NEXT:  1      5     3.00                        smulh	xzr, x27, x26
-# CHECK-NEXT:  1      5     3.00                        umulh	x30, x29, x28
-# CHECK-NEXT:  1      5     3.00                        umulh	x23, x30, xzr
+# CHECK-NEXT:  1      5     1.00                        smulh	x30, x29, x28
+# CHECK-NEXT:  1      5     1.00                        smulh	xzr, x27, x26
+# CHECK-NEXT:  1      5     1.00                        umulh	x30, x29, x28
+# CHECK-NEXT:  1      5     1.00                        umulh	x23, x30, xzr
 # CHECK-NEXT:  1      2     1.00                        madd	w1, w3, w7, w4
 # CHECK-NEXT:  1      2     1.00                        madd	wzr, w0, w9, w11
 # CHECK-NEXT:  1      2     1.00                        madd	w13, wzr, w4, w4
 # CHECK-NEXT:  1      2     1.00                        madd	w19, w30, wzr, w29
 # CHECK-NEXT:  1      2     1.00                        mul	w4, w5, w6
-# CHECK-NEXT:  1      4     3.00                        madd	x1, x3, x7, x4
-# CHECK-NEXT:  1      4     3.00                        madd	xzr, x0, x9, x11
-# CHECK-NEXT:  1      4     3.00                        madd	x13, xzr, x4, x4
-# CHECK-NEXT:  1      4     3.00                        madd	x19, x30, xzr, x29
-# CHECK-NEXT:  1      4     3.00                        mul	x4, x5, x6
+# CHECK-NEXT:  1      4     1.00                        madd	x1, x3, x7, x4
+# CHECK-NEXT:  1      4     1.00                        madd	xzr, x0, x9, x11
+# CHECK-NEXT:  1      4     1.00                        madd	x13, xzr, x4, x4
+# CHECK-NEXT:  1      4     1.00                        madd	x19, x30, xzr, x29
+# CHECK-NEXT:  1      4     1.00                        mul	x4, x5, x6
 # CHECK-NEXT:  1      2     1.00                        msub	w1, w3, w7, w4
 # CHECK-NEXT:  1      2     1.00                        msub	wzr, w0, w9, w11
 # CHECK-NEXT:  1      2     1.00                        msub	w13, wzr, w4, w4
 # CHECK-NEXT:  1      2     1.00                        msub	w19, w30, wzr, w29
 # CHECK-NEXT:  1      2     1.00                        mneg	w4, w5, w6
-# CHECK-NEXT:  1      4     3.00                        msub	x1, x3, x7, x4
-# CHECK-NEXT:  1      4     3.00                        msub	xzr, x0, x9, x11
-# CHECK-NEXT:  1      4     3.00                        msub	x13, xzr, x4, x4
-# CHECK-NEXT:  1      4     3.00                        msub	x19, x30, xzr, x29
-# CHECK-NEXT:  1      4     3.00                        mneg	x4, x5, x6
-# CHECK-NEXT:  1      4     3.00                        smaddl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     3.00                        smaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      4     3.00                        smaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      4     3.00                        smaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     3.00                        smull	x19, w20, w21
-# CHECK-NEXT:  1      4     3.00                        smsubl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     3.00                        smsubl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      4     3.00                        smsubl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      4     3.00                        smsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     3.00                        smnegl	x19, w20, w21
-# CHECK-NEXT:  1      4     3.00                        umaddl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     3.00                        umaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      4     3.00                        umaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      4     3.00                        umaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     3.00                        umull	x19, w20, w21
-# CHECK-NEXT:  1      4     3.00                        umsubl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     3.00                        umsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     3.00                        umnegl	x19, w20, w21
-# CHECK-NEXT:  1      5     3.00                        smulh	x23, x22, xzr
-# CHECK-NEXT:  1      5     3.00                        umulh	x23, x22, xzr
-# CHECK-NEXT:  1      4     3.00                        mul	x19, x20, xzr
+# CHECK-NEXT:  1      4     1.00                        msub	x1, x3, x7, x4
+# CHECK-NEXT:  1      4     1.00                        msub	xzr, x0, x9, x11
+# CHECK-NEXT:  1      4     1.00                        msub	x13, xzr, x4, x4
+# CHECK-NEXT:  1      4     1.00                        msub	x19, x30, xzr, x29
+# CHECK-NEXT:  1      4     1.00                        mneg	x4, x5, x6
+# CHECK-NEXT:  1      4     1.00                        smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     1.00                        smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      4     1.00                        smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      4     1.00                        smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     1.00                        smull	x19, w20, w21
+# CHECK-NEXT:  1      4     1.00                        smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     1.00                        smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      4     1.00                        smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      4     1.00                        smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     1.00                        smnegl	x19, w20, w21
+# CHECK-NEXT:  1      4     1.00                        umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     1.00                        umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      4     1.00                        umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      4     1.00                        umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     1.00                        umull	x19, w20, w21
+# CHECK-NEXT:  1      4     1.00                        umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      4     1.00                        umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      4     1.00                        umnegl	x19, w20, w21
+# CHECK-NEXT:  1      5     1.00                        smulh	x23, x22, xzr
+# CHECK-NEXT:  1      5     1.00                        umulh	x23, x22, xzr
+# CHECK-NEXT:  1      4     1.00                        mul	x19, x20, xzr
 # CHECK-NEXT:  1      2     1.00                        mneg	w21, w22, w23
-# CHECK-NEXT:  1      4     3.00                        smull	x11, w13, w17
-# CHECK-NEXT:  1      4     3.00                        umull	x11, w13, w17
-# CHECK-NEXT:  1      4     3.00                        smnegl	x11, w13, w17
-# CHECK-NEXT:  1      4     3.00                        umnegl	x11, w13, w17
+# CHECK-NEXT:  1      4     1.00                        smull	x11, w13, w17
+# CHECK-NEXT:  1      4     1.00                        umull	x11, w13, w17
+# CHECK-NEXT:  1      4     1.00                        smnegl	x11, w13, w17
+# CHECK-NEXT:  1      4     1.00                        umnegl	x11, w13, w17
 # CHECK-NEXT:  2      3     1.00                        extr	w3, w5, w7, #0
 # CHECK-NEXT:  2      3     1.00                        extr	w11, w13, w17, #31
 # CHECK-NEXT:  2      3     1.00                        extr	x3, x5, x7, #15
@@ -1264,7 +1264,7 @@
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2.0]  [2.1]  [3]    [4.0]  [4.1]  [5]    [6]
-# CHECK-NEXT: 26.00  34.00  34.00  252.50 252.50 527.33 197.33 197.33 293.00 161.00
+# CHECK-NEXT: 26.00  34.00  34.00  252.50 252.50 449.33 197.33 197.33 293.00 161.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1.0]  [1.1]  [2.0]  [2.1]  [3]    [4.0]  [4.1]  [5]    [6]    Instructions:
@@ -1706,56 +1706,56 @@
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     crc32ch	w25, w26, w16
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     crc32cw	w27, w12, w23
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     crc32cx	w21, w28, x5
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smulh	x30, x29, x28
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smulh	xzr, x27, x26
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umulh	x30, x29, x28
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umulh	x23, x30, xzr
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smulh	x30, x29, x28
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smulh	xzr, x27, x26
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umulh	x30, x29, x28
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umulh	x23, x30, xzr
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	w1, w3, w7, w4
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	wzr, w0, w9, w11
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	w13, wzr, w4, w4
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	w19, w30, wzr, w29
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mul	w4, w5, w6
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     madd	x1, x3, x7, x4
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     madd	xzr, x0, x9, x11
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     madd	x13, xzr, x4, x4
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     madd	x19, x30, xzr, x29
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     mul	x4, x5, x6
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	x1, x3, x7, x4
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	xzr, x0, x9, x11
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	x13, xzr, x4, x4
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     madd	x19, x30, xzr, x29
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mul	x4, x5, x6
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	w1, w3, w7, w4
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	wzr, w0, w9, w11
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	w13, wzr, w4, w4
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	w19, w30, wzr, w29
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mneg	w4, w5, w6
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     msub	x1, x3, x7, x4
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     msub	xzr, x0, x9, x11
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     msub	x13, xzr, x4, x4
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     msub	x19, x30, xzr, x29
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     mneg	x4, x5, x6
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smull	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	xzr, w10, w11, x12
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	x13, wzr, w14, x15
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smnegl	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umull	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umsubl	x3, w5, w2, x9
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umnegl	x19, w20, w21
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smulh	x23, x22, xzr
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umulh	x23, x22, xzr
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     mul	x19, x20, xzr
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	x1, x3, x7, x4
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	xzr, x0, x9, x11
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	x13, xzr, x4, x4
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     msub	x19, x30, xzr, x29
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mneg	x4, x5, x6
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smull	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smnegl	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umull	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umnegl	x19, w20, w21
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smulh	x23, x22, xzr
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umulh	x23, x22, xzr
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mul	x19, x20, xzr
 # CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     mneg	w21, w22, w23
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smull	x11, w13, w17
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umull	x11, w13, w17
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     smnegl	x11, w13, w17
-# CHECK-NEXT:  -      -      -      -      -     3.00    -      -      -      -     umnegl	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smull	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umull	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     smnegl	x11, w13, w17
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -      -      -     umnegl	x11, w13, w17
 # CHECK-NEXT:  -      -      -      -      -     1.33   0.33   0.33    -      -     extr	w3, w5, w7, #0
 # CHECK-NEXT:  -      -      -      -      -     1.33   0.33   0.33    -      -     extr	w11, w13, w17, #31
 # CHECK-NEXT:  -      -      -      -      -     1.33   0.33   0.33    -      -     extr	x3, x5, x7, #15
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index 270990154f245..1c03d4be8add2 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -613,10 +613,10 @@
 # CHECK-NEXT:  1      2     0.50                        mla	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  1      2     0.50                        mla	v15.8h, v22.8h, v4.h[3]
 # CHECK-NEXT:  1      2     0.50                        mla	v28.2s, v10.2s, v2.s[0]
-# CHECK-NEXT:  1      4     1.00                        mls	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      5     1.00                        mls	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  1      2     0.50                        mls	v25.8h, v29.8h, v0.h[4]
 # CHECK-NEXT:  1      2     0.50                        mls	v22.2s, v29.2s, v0.s[3]
-# CHECK-NEXT:  2      5     2.00                        mls	v26.4s, v5.4s, v28.4s
+# CHECK-NEXT:  1      5     2.00                        mls	v26.4s, v5.4s, v28.4s
 # CHECK-NEXT:  1      2     0.50                        mov	b0, v0.b[15]
 # CHECK-NEXT:  1      2     0.50                        mov	d6, v0.d[1]
 # CHECK-NEXT:  1      2     0.50                        mov	h2, v0.h[5]
@@ -938,10 +938,10 @@
 # CHECK-NEXT:  1      2     0.50                        sqrdmlah	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	h0, h1, h2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	v0.4h, v1.4h, v2.4h
-# CHECK-NEXT:  2      5     2.00                        sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT:  1      5     2.00                        sqrdmlah	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	s0, s1, s2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	v0.2s, v1.2s, v2.2s
-# CHECK-NEXT:  2      5     2.00                        sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT:  1      5     2.00                        sqrdmlah	v0.4s, v1.4s, v2.4s
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	h0, h1, v2.h[3]
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	v0.4h, v1.4h, v2.h[3]
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	v0.8h, v1.8h, v2.h[3]
@@ -950,10 +950,10 @@
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	h0, h1, h2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	v0.4h, v1.4h, v2.4h
-# CHECK-NEXT:  2      5     2.00                        sqrdmlsh	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT:  1      5     2.00                        sqrdmlsh	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	s0, s1, s2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	v0.2s, v1.2s, v2.2s
-# CHECK-NEXT:  2      5     2.00                        sqrdmlsh	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT:  1      5     2.00                        sqrdmlsh	v0.4s, v1.4s, v2.4s
 # CHECK-NEXT:  1      4     1.00                        sqrdmulh	h10, h11, h12
 # CHECK-NEXT:  1      2     0.50                        sqrdmulh	h7, h15, v0.h[3]
 # CHECK-NEXT:  1      2     0.50                        sqrdmulh	s15, s14, v0.s[1]

>From a63212d36106a4ed189e6785b150623b74e93411 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 12 Feb 2026 01:07:16 +0000
Subject: [PATCH 4/8] Fixups

---
 llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index b890c01dd0ddb..6a2bf1d3475c0 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -329,9 +329,6 @@ def : SchedAlias<WriteBrReg, N1Write_1c_1B>;
 // Branch and link, register
 def : InstRW<[N1Write_1c_1B_1I], (instrs BL, BLR)>;
 
-// Compare and branch
-def : InstRW<[N1Write_1c_1B], (instregex "^[CT]BN?Z[XW]$")>;
-
 
 // Arithmetic and Logical Instructions
 // -----------------------------------------------------------------------------
@@ -351,7 +348,7 @@ def : SchedAlias<WriteI, N1Write_1c_1I>;
 def : SchedAlias<WriteIEReg, N1Write_2c_1M>;
 
 // Arithmetic, LSL shift, shift <= 4
-// Arithmetic, flagset, LSL shift, shift <= 4ah
+// Arithmetic, flagset, LSL shift, shift <= 4
 // Arithmetic, LSR/ASR/ROR shift or LSL shift > 4
 def N1WriteISReg : SchedWriteVariant<[
                      SchedVar<IsCheapLSL,  [N1Write_1c_1I]>,

>From b0a243c38437f3efdf40994d82a49e707648ab5d Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Wed, 25 Feb 2026 16:43:39 +0000
Subject: [PATCH 5/8] Summary: Responds to comments

- Added smaddl test in N1-forwarding.s
- Removed late forwarding for ASIMD multiply accumulate high (D-form and Q-form) and for ASIMD multiply accumulate long / saturating long
- Updated VFMAL read advance to 3

Co-authored-by: Asher8118
---
 .../Target/AArch64/AArch64SchedNeoverseN1.td  |  18 +-
 .../AArch64/Neoverse/N1-basic-instructions.s  |  44 ++--
 .../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 195 +++++++-----------
 .../AArch64/Neoverse/N1-neon-instructions.s   |   8 +-
 4 files changed, 106 insertions(+), 159 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 6a2bf1d3475c0..99e0ef1d8c261 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -278,13 +278,6 @@ def N1Wr_VMA  : SchedWriteRes<[N1UnitV0]> { let Latency = 5; }
 def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
 def N1Rd_VMA  : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
 
-def N1Wr_VMAH  : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
-def N1Wr_VMAHQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
-def N1Rd_VMAH  : SchedReadAdvance<2, [N1Wr_VMAH, N1Wr_VMAHQ]>;
-
-def N1Wr_VMAL  : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
-def N1Rd_VMAL  : SchedReadAdvance<3, [N1Wr_VMAL]>;
-
 def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
 
@@ -296,7 +289,7 @@ def N1Wr_VFMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
 def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
 
 def N1Wr_VFMAL : SchedWriteRes<[N1UnitV]> { let Latency = 5; }
-def N1Rd_VFMAL : SchedReadAdvance<2, [N1Wr_VFMAL]>;
+def N1Rd_VFMAL : SchedReadAdvance<3, [N1Wr_VFMAL]>;
 
 def N1Wr_CRC : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
 def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
@@ -380,7 +373,7 @@ def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
              (instregex "^M(ADD|SUB)Wrrr$")>;
 def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
              (instregex "^M(ADD|SUB)Xrrr$")>;
-def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
              (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
 
 // Multiply high
@@ -668,12 +661,12 @@ def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
 def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
 
 // ASIMD multiply accumulate high, D-form
-def : InstRW<[N1Wr_VMAH, N1Rd_VMAH], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
+def : InstRW<[N1Write_4c_1V0], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
 
 // ASIMD multiply accumulate saturating long
 // ASIMD multiply long
 // ASIMD multiply accumulate long
-def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv",
+def : InstRW<[N1Write_4c_1V0], (instregex "^[SU]ML[AS]Lv",
                                                  "^SQDML[AS]Lv")>;
 
 // ASIMD multiply, Q-form
@@ -684,8 +677,7 @@ def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
 def : InstRW<[N1Wr_VMAQ, N1Rd_VMA], (instregex "^ML[AS](v8i16|v4i32)$")>;
 
 // ASIMD multiply accumulate high, Q-form
-def : InstRW<[N1Wr_VMAHQ, N1Rd_VMAH],
-             (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
+def : InstRW<[N1Write_5c_2V0], (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
 
 // ASIMD multiply/multiply long (8x8) polynomial, D-form
 def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index 5b5d29623f52f..6fa54ddf61e2f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -472,32 +472,32 @@
 # CHECK-NEXT:  1      4     1.00                        msub	x13, xzr, x4, x4
 # CHECK-NEXT:  1      4     1.00                        msub	x19, x30, xzr, x29
 # CHECK-NEXT:  1      4     1.00                        mneg	x4, x5, x6
-# CHECK-NEXT:  1      4     1.00                        smaddl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     1.00                        smaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      4     1.00                        smaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      4     1.00                        smaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     1.00                        smull	x19, w20, w21
-# CHECK-NEXT:  1      4     1.00                        smsubl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     1.00                        smsubl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      4     1.00                        smsubl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      4     1.00                        smsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     1.00                        smnegl	x19, w20, w21
-# CHECK-NEXT:  1      4     1.00                        umaddl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     1.00                        umaddl	xzr, w10, w11, x12
-# CHECK-NEXT:  1      4     1.00                        umaddl	x13, wzr, w14, x15
-# CHECK-NEXT:  1      4     1.00                        umaddl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     1.00                        umull	x19, w20, w21
-# CHECK-NEXT:  1      4     1.00                        umsubl	x3, w5, w2, x9
-# CHECK-NEXT:  1      4     1.00                        umsubl	x16, w17, wzr, x18
-# CHECK-NEXT:  1      4     1.00                        umnegl	x19, w20, w21
+# CHECK-NEXT:  1      2     1.00                        smaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      2     1.00                        smaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      2     1.00                        smaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      2     1.00                        smaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      2     1.00                        smull	x19, w20, w21
+# CHECK-NEXT:  1      2     1.00                        smsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      2     1.00                        smsubl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      2     1.00                        smsubl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      2     1.00                        smsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      2     1.00                        smnegl	x19, w20, w21
+# CHECK-NEXT:  1      2     1.00                        umaddl	x3, w5, w2, x9
+# CHECK-NEXT:  1      2     1.00                        umaddl	xzr, w10, w11, x12
+# CHECK-NEXT:  1      2     1.00                        umaddl	x13, wzr, w14, x15
+# CHECK-NEXT:  1      2     1.00                        umaddl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      2     1.00                        umull	x19, w20, w21
+# CHECK-NEXT:  1      2     1.00                        umsubl	x3, w5, w2, x9
+# CHECK-NEXT:  1      2     1.00                        umsubl	x16, w17, wzr, x18
+# CHECK-NEXT:  1      2     1.00                        umnegl	x19, w20, w21
 # CHECK-NEXT:  1      5     1.00                        smulh	x23, x22, xzr
 # CHECK-NEXT:  1      5     1.00                        umulh	x23, x22, xzr
 # CHECK-NEXT:  1      4     1.00                        mul	x19, x20, xzr
 # CHECK-NEXT:  1      2     1.00                        mneg	w21, w22, w23
-# CHECK-NEXT:  1      4     1.00                        smull	x11, w13, w17
-# CHECK-NEXT:  1      4     1.00                        umull	x11, w13, w17
-# CHECK-NEXT:  1      4     1.00                        smnegl	x11, w13, w17
-# CHECK-NEXT:  1      4     1.00                        umnegl	x11, w13, w17
+# CHECK-NEXT:  1      2     1.00                        smull	x11, w13, w17
+# CHECK-NEXT:  1      2     1.00                        umull	x11, w13, w17
+# CHECK-NEXT:  1      2     1.00                        smnegl	x11, w13, w17
+# CHECK-NEXT:  1      2     1.00                        umnegl	x11, w13, w17
 # CHECK-NEXT:  2      3     1.00                        extr	w3, w5, w7, #0
 # CHECK-NEXT:  2      3     1.00                        extr	w11, w13, w17, #31
 # CHECK-NEXT:  2      3     1.00                        extr	x3, x5, x7, #15
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index bb70593f1b91d..4bffe97ed1b07 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -31,20 +31,6 @@ mla v0.4s, v1.4s, v2.4s
 mla v0.4s, v0.4s, v1.4s
 # LLVM-MCA-END
 
-# LLVM-MCA-BEGIN sqrdmlah
-mul      v0.4s, v0.4s, v0.4s
-sqrdmlah v0.4s, v1.4s, v2.4s
-sqrdmlah v0.4s, v1.4s, v2.4s
-sqrdmlah v0.4s, v0.4s, v1.4s
-# LLVM-MCA-END
-
-# LLVM-MCA-BEGIN smlal2
-mul    v0.4s, v0.4s, v0.4s
-smlal2 v0.4s, v1.8h, v2.8h
-smlal2 v0.4s, v1.8h, v2.8h
-smlal2 v0.4s, v0.8h, v1.8h
-# LLVM-MCA-END
-
 # LLVM-MCA-BEGIN sadalp
 mul    v0.4s, v0.4s, v0.4s
 sadalp v0.2d, v1.4s
@@ -91,6 +77,13 @@ smulh x0, x0, x1
 smulh x0, x0, x0
 # LLVM-MCA-END
 
+# LLVM-MCA-BEGIN smaddl
+mul    x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
 # CHECK:      [0] Code Region - madd
 
 # CHECK:      Iterations:        100
@@ -249,83 +242,7 @@ smulh x0, x0, x0
 # CHECK-NEXT: 3.     2     19.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT:        2     14.0   0.1    0.0       <total>
 
-# CHECK:      [4] Code Region - sqrdmlah
-
-# CHECK:      Iterations:        100
-# CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1803
-# CHECK-NEXT: Total uOps:        500
-
-# CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.28
-# CHECK-NEXT: IPC:               0.22
-# CHECK-NEXT: Block RThroughput: 8.0
-
-# CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     .D=======eeeeeER    .    .    .    .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     .D============eeeeeER    .    .    .  .   sqrdmlah	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0]     . D================eeeeeER    .    .  .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     . D=====================eeeeeER    .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     .  D=======================eeeeeER .  .   sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     .  D============================eeeeeER   sqrdmlah	v0.4s, v0.4s, v1.4s
-
-# CHECK:      Average Wait times (based on the timeline view):
-# CHECK-NEXT: [0]: Executions
-# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     9.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     14.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     16.0   0.0    0.0       sqrdmlah	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     21.0   0.0    0.0       sqrdmlah	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     15.0   0.1    0.0       <total>
-
-# CHECK:      [5] Code Region - smlal2
-
-# CHECK:      Iterations:        100
-# CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      1403
-# CHECK-NEXT: Total uOps:        500
-
-# CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.36
-# CHECK-NEXT: IPC:               0.29
-# CHECK-NEXT: Block RThroughput: 5.0
-
-# CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
-
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D=====eeeeER   .    .    .    .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,2]     .D=====eeeeER  .    .    .    .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [0,3]     .D=========eeeeER   .    .    .   smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT: [1,0]     . D============eeeeeER   .    .   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     . D=================eeeeER    .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,2]     .  D=================eeeeER   .   smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: [1,3]     .  D=====================eeeeER   smlal2	v0.4s, v0.8h, v1.8h
-
-# CHECK:      Average Wait times (based on the timeline view):
-# CHECK-NEXT: [0]: Executions
-# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     7.0    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     12.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 2.     2     12.0   0.0    0.0       smlal2	v0.4s, v1.8h, v2.8h
-# CHECK-NEXT: 3.     2     16.0   0.0    0.0       smlal2	v0.4s, v0.8h, v1.8h
-# CHECK-NEXT:        2     11.8   0.1    0.0       <total>
-
-# CHECK:      [6] Code Region - sadalp
+# CHECK:      [4] Code Region - sadalp
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -363,7 +280,7 @@ smulh x0, x0, x0
 # CHECK-NEXT: 3.     2     16.0   0.0    0.0       sadalp	v0.2d, v0.4s
 # CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
-# CHECK:      [7] Code Region - ssra
+# CHECK:      [5] Code Region - ssra
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -401,7 +318,7 @@ smulh x0, x0, x0
 # CHECK-NEXT: 3.     2     16.0   0.0    0.0       ssra	v0.2d, v0.2d, #1
 # CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
-# CHECK:      [8] Code Region - fmla
+# CHECK:      [6] Code Region - fmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
@@ -445,34 +362,34 @@ smulh x0, x0, x0
 # CHECK-NEXT: 5.     2     20.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
 # CHECK-NEXT:        2     13.5   0.1    0.0       <total>
 
-# CHECK:      [9] Code Region - fmlal
+# CHECK:      [7] Code Region - fmlal
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
-# CHECK-NEXT: Total Cycles:      2303
+# CHECK-NEXT: Total Cycles:      2203
 # CHECK-NEXT: Total uOps:        600
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.26
-# CHECK-NEXT: IPC:               0.26
+# CHECK-NEXT: uOps Per Cycle:    0.27
+# CHECK-NEXT: IPC:               0.27
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345678
-
-# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,2]     D======eeeeeER .    .    .    .    .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,3]     .D==========eeER    .    .    .    .    .    .  .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [0,4]     .D============eeeeeER    .    .    .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [0,5]     .D=================eeeeeER    .    .    .    .  .   fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT: [1,0]     . D=====================eeeER .    .    .    .  .   fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,1]     . D========================eeeeeER .    .    .  .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,2]     . D===========================eeeeeER   .    .  .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,3]     .  D===============================eeER .    .  .   fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: [1,4]     .  D=================================eeeeeER .  .   fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: [1,5]     .  D======================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: Index     0123456789          0123456789          0123456
+
+# CHECK:      [0,0]     DeeeER    .    .    .    .    .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1]     D===eeeeeER    .    .    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2]     D=====eeeeeER  .    .    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,3]     .D=========eeER.    .    .    .    .    .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4]     .D===========eeeeeER.    .    .    .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5]     .D================eeeeeER.    .    .    .    ..   fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0]     . D====================eeeER  .    .    .    ..   fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1]     . D=======================eeeeeER  .    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2]     . D=========================eeeeeER.    .    ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,3]     .  D=============================eeER   .    ..   fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4]     .  D===============================eeeeeER   ..   fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5]     .  D====================================eeeeeER   fmlal	v0.4s, v0.4h, v1.4h
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -481,15 +398,15 @@ smulh x0, x0, x0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     11.5   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 1.     2     14.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 2.     2     17.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 3.     2     21.5   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
-# CHECK-NEXT: 4.     2     23.5   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
-# CHECK-NEXT: 5.     2     28.5   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
-# CHECK-NEXT:        2     19.5   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     11.0   0.5    0.0       fmul	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1.     2     14.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2.     2     16.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 3.     2     20.0   0.0    0.0       fadd	v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4.     2     22.0   0.0    0.0       fmlal	v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5.     2     27.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
+# CHECK-NEXT:        2     18.3   0.1    0.0       <total>
 
-# CHECK:      [10] Code Region - crc32
+# CHECK:      [8] Code Region - crc32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -527,7 +444,7 @@ smulh x0, x0, x0
 # CHECK-NEXT: 3.     2     8.0    0.0    0.0       crc32cb	w0, w0, w0
 # CHECK-NEXT:        2     6.1    0.1    0.0       <total>
 
-# CHECK:      [11] Code Region - smulh
+# CHECK:      [9] Code Region - smulh
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -564,3 +481,41 @@ smulh x0, x0, x0
 # CHECK-NEXT: 2.     2     18.5   0.0    0.0       smulh	x0, x0, x1
 # CHECK-NEXT: 3.     2     23.0   0.0    0.0       smulh	x0, x0, x0
 # CHECK-NEXT:        2     16.4   0.1    0.0       <total>
+
+# CHECK:      [10] Code Region - smaddl
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .  .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D===eeER  .    .  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,2]     D====eeER .    .  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,3]     .D=====eeER    .  .   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,0]     .D=======eeeeER.  .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D==========eeER  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,2]     . D==========eeER .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     . D============eeER   smaddl	x0, w0, w0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 2.     2     8.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     7.4    0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index 1c03d4be8add2..78c55729d3f7c 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -938,10 +938,10 @@
 # CHECK-NEXT:  1      2     0.50                        sqrdmlah	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	h0, h1, h2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	v0.4h, v1.4h, v2.4h
-# CHECK-NEXT:  1      5     2.00                        sqrdmlah	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT:  2      5     2.00                        sqrdmlah	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	s0, s1, s2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlah	v0.2s, v1.2s, v2.2s
-# CHECK-NEXT:  1      5     2.00                        sqrdmlah	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT:  2      5     2.00                        sqrdmlah	v0.4s, v1.4s, v2.4s
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	h0, h1, v2.h[3]
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	v0.4h, v1.4h, v2.h[3]
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	v0.8h, v1.8h, v2.h[3]
@@ -950,10 +950,10 @@
 # CHECK-NEXT:  1      2     0.50                        sqrdmlsh	v0.4s, v1.4s, v2.s[1]
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	h0, h1, h2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	v0.4h, v1.4h, v2.4h
-# CHECK-NEXT:  1      5     2.00                        sqrdmlsh	v0.8h, v1.8h, v2.8h
+# CHECK-NEXT:  2      5     2.00                        sqrdmlsh	v0.8h, v1.8h, v2.8h
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	s0, s1, s2
 # CHECK-NEXT:  1      4     1.00                        sqrdmlsh	v0.2s, v1.2s, v2.2s
-# CHECK-NEXT:  1      5     2.00                        sqrdmlsh	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT:  2      5     2.00                        sqrdmlsh	v0.4s, v1.4s, v2.4s
 # CHECK-NEXT:  1      4     1.00                        sqrdmulh	h10, h11, h12
 # CHECK-NEXT:  1      2     0.50                        sqrdmulh	h7, h15, v0.h[3]
 # CHECK-NEXT:  1      2     0.50                        sqrdmulh	s15, s14, v0.s[1]

>From 147cf9088c3fdf02187fcc380cd4034d90cb2f2e Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Wed, 25 Feb 2026 19:52:24 +0000
Subject: [PATCH 6/8] Update test

---
 .../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index 4bffe97ed1b07..ad40aff3f30ac 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -88,26 +88,26 @@ smaddl x0, w0, w0, x0
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      407
+# CHECK-NEXT: Total Cycles:      1403
 # CHECK-NEXT: Total uOps:        400
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.98
-# CHECK-NEXT: IPC:               0.98
+# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: IPC:               0.29
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     012345
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT:                     0123456789          0
+# CHECK-NEXT: Index     0123456789          0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .   mul	x0, x0, x0
-# CHECK-NEXT: [0,1]     D==eeeeER .    .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,2]     D=eeeeE-R .    .   madd	x0, x1, x2, x0
-# CHECK-NEXT: [0,3]     .D====eeeeER   .   madd	x0, x0, x0, x0
-# CHECK-NEXT: [1,0]     .D========eeeeER   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     .D===eeeeE-----R   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,2]     . D=eeeeE------R   madd	x0, x1, x2, x0
-# CHECK-NEXT: [1,3]     . D=====eeeeE--R   madd	x0, x0, x0, x0
+# CHECK:      [0,0]     DeeeeER   .    .    .    .    .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D===eeeeER.    .    .    .    .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,2]     D======eeeeER  .    .    .    .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [0,3]     .D=========eeeeER   .    .    .   madd	x0, x0, x0, x0
+# CHECK-NEXT: [1,0]     .D=============eeeeER    .    .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D================eeeeER .    .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,2]     . D==================eeeeER   .   madd	x0, x1, x2, x0
+# CHECK-NEXT: [1,3]     . D======================eeeeER   madd	x0, x0, x0, x0
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -116,11 +116,11 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     5.0    0.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     3.5    3.5    2.5       madd	x0, x1, x2, x0
-# CHECK-NEXT: 2.     2     2.0    2.0    3.5       madd	x0, x1, x2, x0
-# CHECK-NEXT: 3.     2     5.5    0.0    1.0       madd	x0, x0, x0, x0
-# CHECK-NEXT:        2     4.0    1.5    1.8       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     10.5   0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 2.     2     13.0   0.0    0.0       madd	x0, x1, x2, x0
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       madd	x0, x0, x0, x0
+# CHECK-NEXT:        2     11.9   0.1    0.0       <total>
 
 # CHECK:      [1] Code Region - fmadd
 

>From 964021f7aa3df651924f518835f6bc69ae4493d9 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Fri, 27 Feb 2026 10:59:31 +0000
Subject: [PATCH 7/8] Responds to comments and some general fix ups

---
 .../Target/AArch64/AArch64SchedNeoverseN1.td  |  21 ++--
 .../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 106 +++++++++---------
 .../AArch64/Neoverse/N1-neon-instructions.s   |   2 +-
 3 files changed, 67 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 99e0ef1d8c261..046c0abeecf04 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -274,10 +274,13 @@ def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
 def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
 
-def N1Wr_VMA  : SchedWriteRes<[N1UnitV0]> { let Latency = 5; }
+def N1Wr_VMA  : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
 def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
 def N1Rd_VMA  : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
 
+def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
+def N1Rd_VMAL : SchedReadAdvance<3, [N1Wr_VMAL]>;
+
 def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
 
@@ -365,7 +368,7 @@ def : SchedAlias<WriteID64, N1Write_20c5_1M>;
 
 // Multiply accumulate, W-form
 // Multiply accumulate, X-form
-// Multiply accumulate, long
+// Multiply accumulate long
 def : SchedAlias<WriteIM32, N1Write_2c_1M>;
 def : SchedAlias<WriteIM64, N1Write_4c3_1M>;
 
@@ -654,8 +657,10 @@ def : InstRW<[N1Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
 
 // ASIMD multiply, D-form
 def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
-                                          "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
-                                          "^([SU]|SQD)MULLv")>;
+                                          "^SQ(R)?DMULH(v[14]i16|v[12]i32)$")>;
+
+// ASIMD multiply long
+def : InstRW<[N1Write_4c_1V0], (instregex "^([SU]|SQD)MULLv")>;
 
 // ASIMD multiply accumulate, D-form
 def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
@@ -663,11 +668,11 @@ def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
 // ASIMD multiply accumulate high, D-form
 def : InstRW<[N1Write_4c_1V0], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
 
-// ASIMD multiply accumulate saturating long
-// ASIMD multiply long
 // ASIMD multiply accumulate long
-def : InstRW<[N1Write_4c_1V0], (instregex "^[SU]ML[AS]Lv",
-                                                 "^SQDML[AS]Lv")>;
+def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[N1Write_4c_1V0], (instregex "^SQDML[AS]Lv")>;
 
 // ASIMD multiply, Q-form
 def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index ad40aff3f30ac..c5d02ec165d05 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -17,6 +17,13 @@ fmadd d0, d1, d2, d0
 fmadd d0, d0, d1, d2
 # LLVM-MCA-END
 
+# LLVM-MCA-BEGIN smaddl
+mul    x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
 # LLVM-MCA-BEGIN saba
 mul  v0.4s, v0.4s, v0.4s
 saba v0.4s, v1.4s, v2.4s
@@ -77,13 +84,6 @@ smulh x0, x0, x1
 smulh x0, x0, x0
 # LLVM-MCA-END
 
-# LLVM-MCA-BEGIN smaddl
-mul    x0, x0, x0
-smaddl x0, w1, w2, x0
-smaddl x0, w1, w2, x0
-smaddl x0, w0, w0, x0
-# LLVM-MCA-END
-
 # CHECK:      [0] Code Region - madd
 
 # CHECK:      Iterations:        100
@@ -166,7 +166,45 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 5.     2     23.5   0.0    0.0       fmadd	d0, d0, d1, d2
 # CHECK-NEXT:        2     15.7   0.1    0.0       <total>
 
-# CHECK:      [2] Code Region - saba
+# CHECK:      [2] Code Region - smaddl
+
+# CHECK:      Iterations:        100
+# CHECK-NEXT: Instructions:      400
+# CHECK-NEXT: Total Cycles:      803
+# CHECK-NEXT: Total uOps:        400
+
+# CHECK:      Dispatch Width:    3
+# CHECK-NEXT: uOps Per Cycle:    0.50
+# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeER   .    .  .   mul	x0, x0, x0
+# CHECK-NEXT: [0,1]     D===eeER  .    .  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,2]     D====eeER .    .  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [0,3]     .D=====eeER    .  .   smaddl	x0, w0, w0, x0
+# CHECK-NEXT: [1,0]     .D=======eeeeER.  .   mul	x0, x0, x0
+# CHECK-NEXT: [1,1]     .D==========eeER  .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,2]     . D==========eeER .   smaddl	x0, w1, w2, x0
+# CHECK-NEXT: [1,3]     . D============eeER   smaddl	x0, w0, w0, x0
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
+# CHECK-NEXT: 1.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 2.     2     8.0    0.0    0.0       smaddl	x0, w1, w2, x0
+# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
+# CHECK-NEXT:        2     7.4    0.1    0.0       <total>
+
+# CHECK:      [3] Code Region - saba
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -204,7 +242,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 3.     2     16.0   0.0    0.0       saba	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
-# CHECK:      [3] Code Region - mla
+# CHECK:      [4] Code Region - mla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -242,7 +280,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 3.     2     19.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
 # CHECK-NEXT:        2     14.0   0.1    0.0       <total>
 
-# CHECK:      [4] Code Region - sadalp
+# CHECK:      [5] Code Region - sadalp
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -280,7 +318,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 3.     2     16.0   0.0    0.0       sadalp	v0.2d, v0.4s
 # CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
-# CHECK:      [5] Code Region - ssra
+# CHECK:      [6] Code Region - ssra
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -318,7 +356,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 3.     2     16.0   0.0    0.0       ssra	v0.2d, v0.2d, #1
 # CHECK-NEXT:        2     11.8   0.1    0.0       <total>
 
-# CHECK:      [6] Code Region - fmla
+# CHECK:      [7] Code Region - fmla
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
@@ -362,7 +400,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 5.     2     20.5   0.0    0.0       fmla	v0.2d, v0.2d, v1.2d
 # CHECK-NEXT:        2     13.5   0.1    0.0       <total>
 
-# CHECK:      [7] Code Region - fmlal
+# CHECK:      [8] Code Region - fmlal
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      600
@@ -406,7 +444,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 5.     2     27.0   0.0    0.0       fmlal	v0.4s, v0.4h, v1.4h
 # CHECK-NEXT:        2     18.3   0.1    0.0       <total>
 
-# CHECK:      [8] Code Region - crc32
+# CHECK:      [9] Code Region - crc32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -444,7 +482,7 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 3.     2     8.0    0.0    0.0       crc32cb	w0, w0, w0
 # CHECK-NEXT:        2     6.1    0.1    0.0       <total>
 
-# CHECK:      [9] Code Region - smulh
+# CHECK:      [10] Code Region - smulh
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
@@ -481,41 +519,3 @@ smaddl x0, w0, w0, x0
 # CHECK-NEXT: 2.     2     18.5   0.0    0.0       smulh	x0, x0, x1
 # CHECK-NEXT: 3.     2     23.0   0.0    0.0       smulh	x0, x0, x0
 # CHECK-NEXT:        2     16.4   0.1    0.0       <total>
-
-# CHECK:      [10] Code Region - smaddl
-
-# CHECK:      Iterations:        100
-# CHECK-NEXT: Instructions:      400
-# CHECK-NEXT: Total Cycles:      803
-# CHECK-NEXT: Total uOps:        400
-
-# CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.50
-# CHECK-NEXT: Block RThroughput: 4.0
-
-# CHECK:      Timeline view:
-# CHECK-NEXT:                     012345678
-# CHECK-NEXT: Index     0123456789
-
-# CHECK:      [0,0]     DeeeeER   .    .  .   mul	x0, x0, x0
-# CHECK-NEXT: [0,1]     D===eeER  .    .  .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [0,2]     D====eeER .    .  .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [0,3]     .D=====eeER    .  .   smaddl	x0, w0, w0, x0
-# CHECK-NEXT: [1,0]     .D=======eeeeER.  .   mul	x0, x0, x0
-# CHECK-NEXT: [1,1]     .D==========eeER  .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,2]     . D==========eeER .   smaddl	x0, w1, w2, x0
-# CHECK-NEXT: [1,3]     . D============eeER   smaddl	x0, w0, w0, x0
-
-# CHECK:      Average Wait times (based on the timeline view):
-# CHECK-NEXT: [0]: Executions
-# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
-# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
-# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
-
-# CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     4.5    0.5    0.0       mul	x0, x0, x0
-# CHECK-NEXT: 1.     2     7.5    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 2.     2     8.0    0.0    0.0       smaddl	x0, w1, w2, x0
-# CHECK-NEXT: 3.     2     9.5    0.0    0.0       smaddl	x0, w0, w0, x0
-# CHECK-NEXT:        2     7.4    0.1    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index 78c55729d3f7c..a4910045eca97 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -613,7 +613,7 @@
 # CHECK-NEXT:  1      2     0.50                        mla	v0.8b, v0.8b, v0.8b
 # CHECK-NEXT:  1      2     0.50                        mla	v15.8h, v22.8h, v4.h[3]
 # CHECK-NEXT:  1      2     0.50                        mla	v28.2s, v10.2s, v2.s[0]
-# CHECK-NEXT:  1      5     1.00                        mls	v0.4h, v0.4h, v0.4h
+# CHECK-NEXT:  1      4     1.00                        mls	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  1      2     0.50                        mls	v25.8h, v29.8h, v0.h[4]
 # CHECK-NEXT:  1      2     0.50                        mls	v22.2s, v29.2s, v0.s[3]
 # CHECK-NEXT:  1      5     2.00                        mls	v26.4s, v5.4s, v28.4s

>From ffafcb8c4131d9fd89c0c8ba4a483a86e31083d5 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Mon, 2 Mar 2026 15:48:37 +0000
Subject: [PATCH 8/8] Update MLA/MLS Q-form to take two uops

---
 .../Target/AArch64/AArch64SchedNeoverseN1.td  |  5 +++-
 .../llvm-mca/AArch64/Neoverse/N1-forwarding.s | 28 +++++++++----------
 .../AArch64/Neoverse/N1-neon-instructions.s   |  2 +-
 3 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 046c0abeecf04..6134461aa609d 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -275,7 +275,10 @@ def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
 def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
 
 def N1Wr_VMA  : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
-def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5; }
+def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
 def N1Rd_VMA  : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
 
 def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
index c5d02ec165d05..3080a04872bf9 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -247,10 +247,10 @@ smulh x0, x0, x0
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      400
 # CHECK-NEXT: Total Cycles:      1703
-# CHECK-NEXT: Total uOps:        500
+# CHECK-NEXT: Total uOps:        800
 
 # CHECK:      Dispatch Width:    3
-# CHECK-NEXT: uOps Per Cycle:    0.29
+# CHECK-NEXT: uOps Per Cycle:    0.47
 # CHECK-NEXT: IPC:               0.23
 # CHECK-NEXT: Block RThroughput: 8.0
 
@@ -259,13 +259,13 @@ smulh x0, x0, x0
 # CHECK-NEXT: Index     0123456789          0123456789
 
 # CHECK:      [0,0]     DeeeeeER  .    .    .    .    .    ..   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [0,1]     D=====eeeeeER  .    .    .    .    ..   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,2]     .D======eeeeeER.    .    .    .    ..   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [0,3]     .D===========eeeeeER.    .    .    ..   mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT: [1,0]     . D===============eeeeeER.    .    ..   mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: [1,1]     . D====================eeeeeER.    ..   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,2]     .  D=====================eeeeeER   ..   mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: [1,3]     .  D==========================eeeeeER   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [0,1]     .D====eeeeeER  .    .    .    .    ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2]     . D=====eeeeeER.    .    .    .    ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3]     .  D=========eeeeeER.    .    .    ..   mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0]     .   D=============eeeeeER.    .    ..   mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1]     .    D=================eeeeeER.    ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2]     .    .D==================eeeeeER   ..   mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3]     .    . D======================eeeeeER   mla	v0.4s, v0.4s, v1.4s
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -274,11 +274,11 @@ smulh x0, x0, x0
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     2     8.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1.     2     13.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 2.     2     14.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
-# CHECK-NEXT: 3.     2     19.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
-# CHECK-NEXT:        2     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 0.     2     7.5    0.5    0.0       mul	v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1.     2     11.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2.     2     12.5   0.0    0.0       mla	v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3.     2     16.5   0.0    0.0       mla	v0.4s, v0.4s, v1.4s
+# CHECK-NEXT:        2     12.0   0.1    0.0       <total>
 
 # CHECK:      [5] Code Region - sadalp
 
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
index a4910045eca97..270990154f245 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-neon-instructions.s
@@ -616,7 +616,7 @@
 # CHECK-NEXT:  1      4     1.00                        mls	v0.4h, v0.4h, v0.4h
 # CHECK-NEXT:  1      2     0.50                        mls	v25.8h, v29.8h, v0.h[4]
 # CHECK-NEXT:  1      2     0.50                        mls	v22.2s, v29.2s, v0.s[3]
-# CHECK-NEXT:  1      5     2.00                        mls	v26.4s, v5.4s, v28.4s
+# CHECK-NEXT:  2      5     2.00                        mls	v26.4s, v5.4s, v28.4s
 # CHECK-NEXT:  1      2     0.50                        mov	b0, v0.b[15]
 # CHECK-NEXT:  1      2     0.50                        mov	d6, v0.d[1]
 # CHECK-NEXT:  1      2     0.50                        mov	h2, v0.h[5]



More information about the llvm-commits mailing list