[llvm] a28699b - [AArch64] Model late forwarding in Neoverse N1 (#177590)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 02:34:42 PST 2026
Author: Amina Chabane
Date: 2026-03-06T10:34:37Z
New Revision: a28699bdcc5fb9075a2f24db6fc89f1f0c920b7d
URL: https://github.com/llvm/llvm-project/commit/a28699bdcc5fb9075a2f24db6fc89f1f0c920b7d
DIFF: https://github.com/llvm/llvm-project/commit/a28699bdcc5fb9075a2f24db6fc89f1f0c920b7d.diff
LOG: [AArch64] Model late forwarding in Neoverse N1 (#177590)
Based on the [Neoverse N1 Software Optimization Guide
(SWOG)](https://developer.arm.com/documentation/109896/latest/), this
patch models late forwarding in the Neoverse N1 scheduling model.
Added:
llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
Modified:
llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
index 80e5bff5abba7..3b101e0b7655e 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td
@@ -261,6 +261,47 @@ def N1Write_9c_6L_6V : SchedWriteRes<[N1UnitL, N1UnitL, N1UnitL,
N1UnitV, N1UnitV, N1UnitV,
N1UnitV, N1UnitV, N1UnitV]>;
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+
+def N1Wr_IM32 : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
+def N1Wr_IM64 : SchedWriteRes<[N1UnitM]> { let Latency = 4;
+ let ReleaseAtCycles = [3]; }
+def N1Rd_IMA : SchedReadAdvance<1, [N1Wr_IM32, N1Wr_IM64]>;
+
+def N1Wr_MH : SchedWriteRes<[N1UnitM]> { let Latency = 5;
+ let ReleaseAtCycles = [4]; }
+def N1Rd_MH : SchedReadAdvance<2, [N1Wr_MH]>;
+
+def N1Wr_FMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
+def N1Rd_FMA : SchedReadAdvance<2, [WriteFMul, N1Wr_FMA]>;
+
+def N1Wr_VA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
+def N1Rd_VA : SchedReadAdvance<3, [N1Wr_VA]>;
+
+def N1Wr_VMA : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
+def N1Wr_VMAQ : SchedWriteRes<[N1UnitV0, N1UnitV0]> { let Latency = 5;
+ let NumMicroOps = 2; }
+def N1Rd_VMA : SchedReadAdvance<3, [N1Wr_VMA, N1Wr_VMAQ]>;
+
+def N1Wr_VMAL : SchedWriteRes<[N1UnitV0]> { let Latency = 4; }
+def N1Rd_VMAL : SchedReadAdvance<3, [N1Wr_VMAL]>;
+
+def N1Wr_VPA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
+def N1Rd_VPA : SchedReadAdvance<3, [N1Wr_VPA]>;
+
+def N1Wr_VSA : SchedWriteRes<[N1UnitV1]> { let Latency = 4; }
+def N1Rd_VSA : SchedReadAdvance<3, [N1Wr_VSA]>;
+
+def N1Wr_VFM : SchedWriteRes<[N1UnitV]> { let Latency = 3; }
+def N1Wr_VFMA : SchedWriteRes<[N1UnitV]> { let Latency = 4; }
+def N1Rd_VFMA : SchedReadAdvance<2, [N1Wr_VFM, N1Wr_VFMA]>;
+
+def N1Wr_VFMAL : SchedWriteRes<[N1UnitV]> { let Latency = 5; }
+def N1Rd_VFMAL : SchedReadAdvance<3, [N1Wr_VFMAL]>;
+
+def N1Wr_CRC : SchedWriteRes<[N1UnitM]> { let Latency = 2; }
+def N1Rd_CRC : SchedReadAdvance<1, [N1Wr_CRC]>;
// Miscellaneous Instructions
// -----------------------------------------------------------------------------
@@ -327,13 +368,21 @@ def : InstRW<[N1Write_2c_1M], (instregex "^(AND|BIC)S[WX]rs$")>;
def : SchedAlias<WriteID32, N1Write_12c5_1M>;
def : SchedAlias<WriteID64, N1Write_20c5_1M>;
-// Multiply accumulate
-// Multiply accumulate, long
+// Multiply accumulate, W-form
+// Multiply accumulate, X-form
+// Multiply accumulate long
def : SchedAlias<WriteIM32, N1Write_2c_1M>;
def : SchedAlias<WriteIM64, N1Write_4c3_1M>;
+def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
+ (instregex "^M(ADD|SUB)Wrrr$")>;
+def : InstRW<[N1Wr_IM64, ReadIM, ReadIM, N1Rd_IMA],
+ (instregex "^M(ADD|SUB)Xrrr$")>;
+def : InstRW<[N1Wr_IM32, ReadIM, ReadIM, N1Rd_IMA],
+ (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+
// Multiply high
-def : InstRW<[N1Write_5c3_1M, ReadIM, ReadIM], (instrs SMULHrr, UMULHrr)>;
+def : InstRW<[N1Wr_MH, ReadIM, ReadIM, N1Rd_MH], (instrs SMULHrr, UMULHrr)>;
// Miscellaneous data-processing instructions
@@ -431,7 +480,7 @@ def : InstRW<[N1Write_17c7_1V0], (instrs FSQRTDr)>;
def : SchedAlias<WriteFMul, N1Write_3c_1V>;
// FP multiply accumulate
-def : InstRW<[N1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
+def : InstRW<[N1Wr_FMA, ReadDefault, ReadDefault, N1Rd_FMA], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>;
// FP round to integral
def : InstRW<[N1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ][HSD]r$",
@@ -588,7 +637,7 @@ def : SchedAlias<WriteVq, N1Write_2c_1V>;
// ASIMD absolute diff accum
// ASIMD absolute diff accum long
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]ABAL?v")>;
+def : InstRW<[N1Wr_VA, N1Rd_VA], (instregex "^[SU]ABAL?v")>;
// ASIMD arith, reduce, 4H/4S
def : InstRW<[N1Write_3c_1V1], (instregex "^(ADDV|[SU]ADDLV)v4(i16|i32)v$")>;
@@ -609,26 +658,33 @@ def : InstRW<[N1Write_5c_1V1_1V], (instregex "^[SU](MAX|MIN)Vv8(i8|i16)v$")>;
def : InstRW<[N1Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
// ASIMD multiply, D-form
+def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
+ "^SQ(R)?DMULH(v[14]i16|v[12]i32)$")>;
+
+// ASIMD multiply long
+def : InstRW<[N1Write_4c_1V0], (instregex "^([SU]|SQD)MULLv")>;
+
// ASIMD multiply accumulate, D-form
+def : InstRW<[N1Wr_VMA, N1Rd_VMA], (instregex "^ML[AS](v[14]i16|v[12]i32)$")>;
+
// ASIMD multiply accumulate high, D-form
-// ASIMD multiply accumulate saturating long
-// ASIMD multiply long
+def : InstRW<[N1Write_4c_1V0], (instregex "^SQRDML[AS]H(v[14]i16|v[12]i32)$")>;
+
// ASIMD multiply accumulate long
-def : InstRW<[N1Write_4c_1V0], (instregex "^MUL(v[14]i16|v[12]i32)$",
- "^ML[AS](v[14]i16|v[12]i32)$",
- "^SQ(R)?DMULH(v[14]i16|v[12]i32)$",
- "^SQRDML[AS]H(v[14]i16|v[12]i32)$",
- "^SQDML[AS]Lv",
- "^([SU]|SQD)MULLv",
- "^[SU]ML[AS]Lv")>;
+def : InstRW<[N1Wr_VMAL, N1Rd_VMAL], (instregex "^[SU]ML[AS]Lv")>;
+
+// ASIMD multiply accumulate saturating long
+def : InstRW<[N1Write_4c_1V0], (instregex "^SQDML[AS]Lv")>;
// ASIMD multiply, Q-form
+def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
+ "^SQ(R)?DMULH(v8i16|v4i32)$")>;
+
// ASIMD multiply accumulate, Q-form
+def : InstRW<[N1Wr_VMAQ, N1Rd_VMA], (instregex "^ML[AS](v8i16|v4i32)$")>;
+
// ASIMD multiply accumulate high, Q-form
-def : InstRW<[N1Write_5c_2V0], (instregex "^MUL(v8i16|v4i32)$",
- "^ML[AS](v8i16|v4i32)$",
- "^SQ(R)?DMULH(v8i16|v4i32)$",
- "^SQRDML[AS]H(v8i16|v4i32)$")>;
+def : InstRW<[N1Write_5c_2V0], (instregex "^SQRDML[AS]H(v8i16|v4i32)$")>;
// ASIMD multiply/multiply long (8x8) polynomial, D-form
def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
@@ -637,10 +693,10 @@ def : InstRW<[N1Write_3c_1V0], (instrs PMULv8i8, PMULLv8i8)>;
def : InstRW<[N1Write_4c_2V0], (instrs PMULv16i8, PMULLv16i8)>;
// ASIMD pairwise add and accumulate long
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]ADALPv")>;
+def : InstRW<[N1Wr_VPA, N1Rd_VPA], (instregex "^[SU]ADALPv")>;
// ASIMD shift accumulate
-def : InstRW<[N1Write_4c_1V1], (instregex "^[SU]R?SRAv")>;
+def : InstRW<[N1Wr_VSA, N1Rd_VSA], (instregex "^[SU]R?SRAv")>;
// ASIMD shift by immed, basic
// ASIMD shift by immed and insert, basic
@@ -722,13 +778,13 @@ def : InstRW<[N1Write_5c_1V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>;
def : InstRW<[N1Write_8c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>;
// ASIMD FP multiply
-def : InstRW<[N1Write_3c_1V], (instregex "^FMULX?v")>;
+def : InstRW<[N1Wr_VFM], (instregex "^FMULX?v")>;
// ASIMD FP multiply accumulate
-def : InstRW<[N1Write_4c_1V], (instregex "^FML[AS]v")>;
+def : InstRW<[N1Wr_VFMA, N1Rd_VFMA], (instregex "^FML[AS]v")>;
// ASIMD FP multiply accumulate long
-def : InstRW<[N1Write_5c_1V], (instregex "^FML[AS]L2?v")>;
+def : InstRW<[N1Wr_VFMAL, N1Rd_VFMAL], (instregex "^FML[AS]L2?v")>;
// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[N1Write_3c_1V0], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>;
@@ -1053,7 +1109,7 @@ def : InstRW<[N1Write_4c_1V0], (instregex "^SHA1[CMP]rrr$",
// -----------------------------------------------------------------------------
// CRC checksum ops
-def : InstRW<[N1Write_2c_1M], (instregex "^CRC32C?[BHWX]rr$")>;
+def : InstRW<[N1Wr_CRC, N1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>;
}
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
index 4803f1e68648b..17ec3056fb3cf 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-basic-instructions.s
@@ -448,10 +448,10 @@
# CHECK-NEXT: 1 2 1.00 crc32ch w25, w26, w16
# CHECK-NEXT: 1 2 1.00 crc32cw w27, w12, w23
# CHECK-NEXT: 1 2 1.00 crc32cx w21, w28, x5
-# CHECK-NEXT: 1 5 3.00 smulh x30, x29, x28
-# CHECK-NEXT: 1 5 3.00 smulh xzr, x27, x26
-# CHECK-NEXT: 1 5 3.00 umulh x30, x29, x28
-# CHECK-NEXT: 1 5 3.00 umulh x23, x30, xzr
+# CHECK-NEXT: 1 5 4.00 smulh x30, x29, x28
+# CHECK-NEXT: 1 5 4.00 smulh xzr, x27, x26
+# CHECK-NEXT: 1 5 4.00 umulh x30, x29, x28
+# CHECK-NEXT: 1 5 4.00 umulh x23, x30, xzr
# CHECK-NEXT: 1 2 1.00 madd w1, w3, w7, w4
# CHECK-NEXT: 1 2 1.00 madd wzr, w0, w9, w11
# CHECK-NEXT: 1 2 1.00 madd w13, wzr, w4, w4
@@ -490,8 +490,8 @@
# CHECK-NEXT: 1 2 1.00 umsubl x3, w5, w2, x9
# CHECK-NEXT: 1 2 1.00 umsubl x16, w17, wzr, x18
# CHECK-NEXT: 1 2 1.00 umnegl x19, w20, w21
-# CHECK-NEXT: 1 5 3.00 smulh x23, x22, xzr
-# CHECK-NEXT: 1 5 3.00 umulh x23, x22, xzr
+# CHECK-NEXT: 1 5 4.00 smulh x23, x22, xzr
+# CHECK-NEXT: 1 5 4.00 umulh x23, x22, xzr
# CHECK-NEXT: 1 4 3.00 mul x19, x20, xzr
# CHECK-NEXT: 1 2 1.00 mneg w21, w22, w23
# CHECK-NEXT: 1 2 1.00 smull x11, w13, w17
@@ -1264,7 +1264,7 @@
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1.0] [1.1] [2.0] [2.1] [3] [4.0] [4.1] [5] [6]
-# CHECK-NEXT: 26.00 34.00 34.00 252.50 252.50 483.33 197.33 197.33 293.00 161.00
+# CHECK-NEXT: 26.00 34.00 34.00 252.50 252.50 489.33 197.33 197.33 293.00 161.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1.0] [1.1] [2.0] [2.1] [3] [4.0] [4.1] [5] [6] Instructions:
@@ -1706,10 +1706,10 @@
# CHECK-NEXT: - - - - - 1.00 - - - - crc32ch w25, w26, w16
# CHECK-NEXT: - - - - - 1.00 - - - - crc32cw w27, w12, w23
# CHECK-NEXT: - - - - - 1.00 - - - - crc32cx w21, w28, x5
-# CHECK-NEXT: - - - - - 3.00 - - - - smulh x30, x29, x28
-# CHECK-NEXT: - - - - - 3.00 - - - - smulh xzr, x27, x26
-# CHECK-NEXT: - - - - - 3.00 - - - - umulh x30, x29, x28
-# CHECK-NEXT: - - - - - 3.00 - - - - umulh x23, x30, xzr
+# CHECK-NEXT: - - - - - 4.00 - - - - smulh x30, x29, x28
+# CHECK-NEXT: - - - - - 4.00 - - - - smulh xzr, x27, x26
+# CHECK-NEXT: - - - - - 4.00 - - - - umulh x30, x29, x28
+# CHECK-NEXT: - - - - - 4.00 - - - - umulh x23, x30, xzr
# CHECK-NEXT: - - - - - 1.00 - - - - madd w1, w3, w7, w4
# CHECK-NEXT: - - - - - 1.00 - - - - madd wzr, w0, w9, w11
# CHECK-NEXT: - - - - - 1.00 - - - - madd w13, wzr, w4, w4
@@ -1748,8 +1748,8 @@
# CHECK-NEXT: - - - - - 1.00 - - - - umsubl x3, w5, w2, x9
# CHECK-NEXT: - - - - - 1.00 - - - - umsubl x16, w17, wzr, x18
# CHECK-NEXT: - - - - - 1.00 - - - - umnegl x19, w20, w21
-# CHECK-NEXT: - - - - - 3.00 - - - - smulh x23, x22, xzr
-# CHECK-NEXT: - - - - - 3.00 - - - - umulh x23, x22, xzr
+# CHECK-NEXT: - - - - - 4.00 - - - - smulh x23, x22, xzr
+# CHECK-NEXT: - - - - - 4.00 - - - - umulh x23, x22, xzr
# CHECK-NEXT: - - - - - 3.00 - - - - mul x19, x20, xzr
# CHECK-NEXT: - - - - - 1.00 - - - - mneg w21, w22, w23
# CHECK-NEXT: - - - - - 1.00 - - - - smull x11, w13, w17
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
new file mode 100644
index 0000000000000..df5d2590d0c64
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-forwarding.s
@@ -0,0 +1,521 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n1 -mattr=+fp16fml --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmul d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sadalp
+mul v0.4s, v0.4s, v0.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v1.4s
+sadalp v0.2d, v0.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN ssra
+mul v0.4s, v0.4s, v0.4s
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v1.2d, #1
+ssra v0.2d, v0.2d, #1
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmla
+fmul v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v1.2d, v2.2d
+fadd v0.2d, v0.2d, v0.2d
+fmla v0.2d, v1.2d, v2.2d
+fmla v0.2d, v0.2d, v1.2d
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmlal
+fmul v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v1.4h, v2.4h
+fadd v0.2d, v0.2d, v0.2d
+fmlal v0.4s, v1.4h, v2.4h
+fmlal v0.4s, v0.4h, v1.4h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN crc32
+mul w0, w0, w0
+crc32cb w0, w0, w1
+crc32cb w0, w0, w1
+crc32cb w0, w0, w0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smulh
+mul x0, x0, x0
+smulh x0, x0, x1
+smulh x0, x0, x1
+smulh x0, x0, x0
+# LLVM-MCA-END
+
+# CHECK: [0] Code Region - madd
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.29
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 12.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeER . . . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D===eeeeER. . . . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,2] D======eeeeER . . . . madd x0, x1, x2, x0
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . madd x0, x0, x0, x0
+# CHECK-NEXT: [1,0] .D=============eeeeER . . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D================eeeeER . . madd x0, x1, x2, x0
+# CHECK-NEXT: [1,2] . D==================eeeeER . madd x0, x1, x2, x0
+# CHECK-NEXT: [1,3] . D======================eeeeER madd x0, x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 10.5 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 2. 2 13.0 0.0 0.0 madd x0, x1, x2, x0
+# CHECK-NEXT: 3. 2 16.5 0.0 0.0 madd x0, x0, x0, x0
+# CHECK-NEXT: 2 11.9 0.1 0.0 <total>
+
+# CHECK: [1] Code Region - fmadd
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 1903
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.32
+# CHECK-NEXT: IPC: 0.32
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0
+
+# CHECK: [0,0] DeeER. . . . . . . . fadd d0, d0, d0
+# CHECK-NEXT: [0,1] D==eeeeER . . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,2] D====eeeeER . . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,3] .D=======eeeER . . . . . . fmul d0, d0, d0
+# CHECK-NEXT: [0,4] .D==========eeeeER . . . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [0,5] .D==============eeeeER . . . . fmadd d0, d0, d1, d2
+# CHECK-NEXT: [1,0] . D=================eeER . . . . fadd d0, d0, d0
+# CHECK-NEXT: [1,1] . D===================eeeeER . . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,2] . D=====================eeeeER. . . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,3] . D========================eeeER . . fmul d0, d0, d0
+# CHECK-NEXT: [1,4] . D===========================eeeeER . fmadd d0, d1, d2, d0
+# CHECK-NEXT: [1,5] . D===============================eeeeER fmadd d0, d0, d1, d2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmul d0, d0, d0
+# CHECK-NEXT: 4. 2 19.5 0.0 0.0 fmadd d0, d1, d2, d0
+# CHECK-NEXT: 5. 2 23.5 0.0 0.0 fmadd d0, d0, d1, d2
+# CHECK-NEXT: 2 15.7 0.1 0.0 <total>
+
+# CHECK: [2] Code Region - smaddl
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 803
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.50
+# CHECK-NEXT: IPC: 0.50
+# CHECK-NEXT: Block RThroughput: 6.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 012345678
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeeeER . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D===eeER . . . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,2] D====eeER . . . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [0,3] .D=====eeER . . smaddl x0, w0, w0, x0
+# CHECK-NEXT: [1,0] .D=======eeeeER. . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D==========eeER . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,2] . D==========eeER . smaddl x0, w1, w2, x0
+# CHECK-NEXT: [1,3] . D============eeER smaddl x0, w0, w0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 2. 2 8.0 0.0 0.0 smaddl x0, w1, w2, x0
+# CHECK-NEXT: 3. 2 9.5 0.0 0.0 smaddl x0, w0, w0, x0
+# CHECK-NEXT: 2 7.4 0.1 0.0 <total>
+
+# CHECK: [3] Code Region - saba
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . D=================eeeeER . saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . D=====================eeeeER saba v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [4] Code Region - mla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 800
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.47
+# CHECK-NEXT: IPC: 0.23
+# CHECK-NEXT: Block RThroughput: 8.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] .D====eeeeeER . . . . .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,2] . D=====eeeeeER. . . . .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [0,3] . D=========eeeeeER. . . .. mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: [1,0] . D=============eeeeeER. . .. mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeeER. .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,2] . .D==================eeeeeER .. mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: [1,3] . . D======================eeeeeER mla v0.4s, v0.4s, v1.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s
+# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s
+# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
+
+# CHECK: [5] Code Region - sadalp
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . sadalp v0.2d, v0.4s
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,2] . D=================eeeeER . sadalp v0.2d, v1.4s
+# CHECK-NEXT: [1,3] . D=====================eeeeER sadalp v0.2d, v0.4s
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [6] Code Region - ssra
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1403
+# CHECK-NEXT: Total uOps: 500
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.36
+# CHECK-NEXT: IPC: 0.29
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeeeER . . . . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,2] .D=====eeeeER . . . . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [0,3] .D=========eeeeER . . . ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: [1,0] . D============eeeeeER . . mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: [1,1] . D=================eeeeER . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,2] . D=================eeeeER . ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: [1,3] . D=====================eeeeER ssra v0.2d, v0.2d, #1
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s
+# CHECK-NEXT: 1. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: 2. 2 12.0 0.0 0.0 ssra v0.2d, v1.2d, #1
+# CHECK-NEXT: 3. 2 16.0 0.0 0.0 ssra v0.2d, v0.2d, #1
+# CHECK-NEXT: 2 11.8 0.1 0.0 <total>
+
+# CHECK: [7] Code Region - fmla
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 1703
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.35
+# CHECK-NEXT: IPC: 0.35
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456
+# CHECK-NEXT: Index 0123456789 0123456789
+
+# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,2] D===eeeeER. . . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,3] .D======eeER . . . . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4] .D========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [0,5] .D============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: [1,0] . D===============eeeER . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] . D================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,2] . D==================eeeeER . .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,3] . D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4] . D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: [1,5] . D===========================eeeeER fmla v0.2d, v0.2d, v1.2d
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 9.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 2. 2 11.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 3. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d
+# CHECK-NEXT: 5. 2 20.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d
+# CHECK-NEXT: 2 13.5 0.1 0.0 <total>
+
+# CHECK: [8] Code Region - fmlal
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 600
+# CHECK-NEXT: Total Cycles: 2203
+# CHECK-NEXT: Total uOps: 600
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.27
+# CHECK-NEXT: IPC: 0.27
+# CHECK-NEXT: Block RThroughput: 3.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0123456
+
+# CHECK: [0,0] DeeeER . . . . . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,3] .D=========eeER. . . . . . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [0,4] .D===========eeeeeER. . . . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [0,5] .D================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: [1,0] . D====================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,1] . D=======================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,2] . D=========================eeeeeER. . .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,3] . D=============================eeER . .. fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: [1,4] . D===============================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: [1,5] . D====================================eeeeeER fmlal v0.4s, v0.4h, v1.4h
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 11.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 1. 2 14.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 2. 2 16.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 3. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d
+# CHECK-NEXT: 4. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h
+# CHECK-NEXT: 5. 2 27.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h
+# CHECK-NEXT: 2 18.3 0.1 0.0 <total>
+
+# CHECK: [9] Code Region - crc32
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 703
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.57
+# CHECK-NEXT: IPC: 0.57
+# CHECK-NEXT: Block RThroughput: 4.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DeeER. . .. mul w0, w0, w0
+# CHECK-NEXT: [0,1] D==eeER . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [0,2] D===eeER . .. crc32cb w0, w0, w1
+# CHECK-NEXT: [0,3] .D====eeER. .. crc32cb w0, w0, w0
+# CHECK-NEXT: [1,0] .D======eeER .. mul w0, w0, w0
+# CHECK-NEXT: [1,1] .D========eeER .. crc32cb w0, w0, w1
+# CHECK-NEXT: [1,2] . D========eeER.. crc32cb w0, w0, w1
+# CHECK-NEXT: [1,3] . D==========eeER crc32cb w0, w0, w0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 4.0 0.5 0.0 mul w0, w0, w0
+# CHECK-NEXT: 1. 2 6.0 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 2. 2 6.5 0.0 0.0 crc32cb w0, w0, w1
+# CHECK-NEXT: 3. 2 8.0 0.0 0.0 crc32cb w0, w0, w0
+# CHECK-NEXT: 2 6.1 0.1 0.0 <total>
+
+# CHECK: [10] Code Region - smulh
+
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 400
+# CHECK-NEXT: Total Cycles: 1903
+# CHECK-NEXT: Total uOps: 400
+
+# CHECK: Dispatch Width: 3
+# CHECK-NEXT: uOps Per Cycle: 0.21
+# CHECK-NEXT: IPC: 0.21
+# CHECK-NEXT: Block RThroughput: 15.0
+
+# CHECK: Timeline view:
+# CHECK-NEXT: 0123456789 0123456789
+# CHECK-NEXT: Index 0123456789 0123456789 0
+
+# CHECK: [0,0] DeeeeER . . . . . . . mul x0, x0, x0
+# CHECK-NEXT: [0,1] D====eeeeeER . . . . . . smulh x0, x0, x1
+# CHECK-NEXT: [0,2] D=========eeeeeER . . . . . smulh x0, x0, x1
+# CHECK-NEXT: [0,3] .D=============eeeeeER . . . . smulh x0, x0, x0
+# CHECK-NEXT: [1,0] .D==================eeeeER . . . mul x0, x0, x0
+# CHECK-NEXT: [1,1] .D======================eeeeeER . . smulh x0, x0, x1
+# CHECK-NEXT: [1,2] . D==========================eeeeeER . smulh x0, x0, x1
+# CHECK-NEXT: [1,3] . D===============================eeeeeER smulh x0, x0, x0
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 2 10.0 0.5 0.0 mul x0, x0, x0
+# CHECK-NEXT: 1. 2 14.0 0.0 0.0 smulh x0, x0, x1
+# CHECK-NEXT: 2. 2 18.5 0.0 0.0 smulh x0, x0, x1
+# CHECK-NEXT: 3. 2 23.0 0.0 0.0 smulh x0, x0, x0
+# CHECK-NEXT: 2 16.4 0.1 0.0 <total>
More information about the llvm-commits mailing list