[llvm] [AArch64][machine-scheduler][Neoverse-N2] fdiv is blocking (PR #119206)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 9 05:07:49 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: None (simonwallis2)
<details>
<summary>Changes</summary>
For Neoverse-N2, mark FP divide and square root instructions as blocking their pipeline until complete.
This matches the way that blocking integer divide instructions are marked.
From the Software Optimization Guide, section 3.14 Notes:
1. FP divide and square root operations are performed using an iterative algorithm and block subsequent similar operations to the same pipeline until complete.
Change-Id: I3fa743e62cc41d45d2c12d3923736789aaece7e3
---
Full diff: https://github.com/llvm/llvm-project/pull/119206.diff
3 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td (+47-7)
- (modified) llvm/test/CodeGen/AArch64/machine-combiner.ll (+4-4)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s (+9-9)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index 737fc7390455d8..324de2117bddbb 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -547,6 +547,21 @@ def N2Write_9c_4L_4V : SchedWriteRes<[N2UnitL, N2UnitL, N2UnitL, N2UnitL,
let NumMicroOps = 8;
}
+def N2Write_7c_7V0 : SchedWriteRes<[N2UnitV0]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ReleaseAtCycles = [7];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 9 micro-op types
+
+def N2Write_9c_9V0 : SchedWriteRes<[N2UnitV0]> {
+ let Latency = 9;
+ let NumMicroOps = 9;
+ let ReleaseAtCycles = [9];
+}
+
//===----------------------------------------------------------------------===//
// Define generic 10 micro-op types
@@ -557,6 +572,12 @@ def N2Write_7c_5L01_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
let NumMicroOps = 10;
}
+def N2Write_10c_10V0 : SchedWriteRes<[N2UnitV0]> {
+ let Latency = 10;
+ let NumMicroOps = 10;
+ let ReleaseAtCycles = [10];
+}
+
//===----------------------------------------------------------------------===//
// Define generic 12 micro-op types
@@ -580,6 +601,21 @@ def N2Write_7c_5L01_5S_5V : SchedWriteRes<[N2UnitL01, N2UnitL01, N2UnitL01,
let NumMicroOps = 15;
}
+def N2Write_15c_15V0 : SchedWriteRes<[N2UnitV0]> {
+ let Latency = 15;
+ let NumMicroOps = 15;
+ let ReleaseAtCycles = [15];
+}
+
+//===----------------------------------------------------------------------===//
+// Define generic 16 micro-op types
+
+def N2Write_16c_16V0 : SchedWriteRes<[N2UnitV0]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ReleaseAtCycles = [16];
+}
+
//===----------------------------------------------------------------------===//
// Define generic 18 micro-op types
@@ -795,22 +831,26 @@ def : SchedAlias<WriteF, N2Write_2c_1V>;
// FP compare
def : SchedAlias<WriteFCmp, N2Write_2c_1V0>;
+// FP divide and square root operations are performed using an iterative
+// algorithm and block subsequent similar operations to the same pipeline
+// until complete.
+
// FP divide, square root
-def : SchedAlias<WriteFDiv, N2Write_7c_1V0>;
+def : SchedAlias<WriteFDiv, N2Write_7c_7V0>;
// FP divide, H-form
-def : InstRW<[N2Write_7c_1V0], (instrs FDIVHrr)>;
+def : InstRW<[N2Write_7c_7V0], (instrs FDIVHrr)>;
// FP divide, S-form
-def : InstRW<[N2Write_10c_1V0], (instrs FDIVSrr)>;
+def : InstRW<[N2Write_10c_10V0], (instrs FDIVSrr)>;
// FP divide, D-form
-def : InstRW<[N2Write_15c_1V0], (instrs FDIVDrr)>;
+def : InstRW<[N2Write_15c_15V0], (instrs FDIVDrr)>;
// FP square root, H-form
-def : InstRW<[N2Write_7c_1V0], (instrs FSQRTHr)>;
+def : InstRW<[N2Write_7c_7V0], (instrs FSQRTHr)>;
// FP square root, S-form
-def : InstRW<[N2Write_9c_1V0], (instrs FSQRTSr)>;
+def : InstRW<[N2Write_9c_9V0], (instrs FSQRTSr)>;
// FP square root, D-form
-def : InstRW<[N2Write_16c_1V0], (instrs FSQRTDr)>;
+def : InstRW<[N2Write_16c_16V0], (instrs FSQRTDr)>;
// FP multiply
def : WriteRes<WriteFMul, [N2UnitV]> { let Latency = 3; }
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner.ll b/llvm/test/CodeGen/AArch64/machine-combiner.ll
index 70a638857ce4a9..c8df283aace0b1 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner.ll
@@ -262,8 +262,8 @@ define half @reassociate_adds_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-UNSAFE-LABEL: reassociate_adds_half:
; CHECK-UNSAFE: // %bb.0:
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fadd h1, h3, h2
-; CHECK-UNSAFE-NEXT: fadd h0, h1, h0
+; CHECK-UNSAFE-NEXT: fadd h2, h3, h2
+; CHECK-UNSAFE-NEXT: fadd h0, h2, h0
; CHECK-UNSAFE-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fadd half %x2, %t0
@@ -284,8 +284,8 @@ define half @reassociate_muls_half(half %x0, half %x1, half %x2, half %x3) {
; CHECK-UNSAFE-LABEL: reassociate_muls_half:
; CHECK-UNSAFE: // %bb.0:
; CHECK-UNSAFE-NEXT: fdiv h0, h0, h1
-; CHECK-UNSAFE-NEXT: fmul h1, h3, h2
-; CHECK-UNSAFE-NEXT: fmul h0, h1, h0
+; CHECK-UNSAFE-NEXT: fmul h2, h3, h2
+; CHECK-UNSAFE-NEXT: fmul h0, h2, h0
; CHECK-UNSAFE-NEXT: ret
%t0 = fdiv half %x0, %x1
%t1 = fmul half %x2, %t0
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
index f4c4a20573c4eb..cf1cf0e98c8018 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-basic-instructions.s
@@ -1891,7 +1891,7 @@ drps
# CHECK-NEXT: 1 2 0.50 fmov s0, s1
# CHECK-NEXT: 1 2 0.50 fabs s2, s3
# CHECK-NEXT: 1 2 0.50 fneg s4, s5
-# CHECK-NEXT: 1 9 1.00 fsqrt s6, s7
+# CHECK-NEXT: 9 9 9.00 fsqrt s6, s7
# CHECK-NEXT: 1 3 1.00 fcvt d8, s9
# CHECK-NEXT: 1 3 1.00 fcvt h10, s11
# CHECK-NEXT: 1 3 1.00 frintn s12, s13
@@ -1904,7 +1904,7 @@ drps
# CHECK-NEXT: 1 2 0.50 fmov d0, d1
# CHECK-NEXT: 1 2 0.50 fabs d2, d3
# CHECK-NEXT: 1 2 0.50 fneg d4, d5
-# CHECK-NEXT: 1 16 1.00 fsqrt d6, d7
+# CHECK-NEXT: 16 16 16.00 fsqrt d6, d7
# CHECK-NEXT: 1 3 1.00 fcvt s8, d9
# CHECK-NEXT: 1 3 1.00 fcvt h10, d11
# CHECK-NEXT: 1 3 1.00 frintn d12, d13
@@ -1917,7 +1917,7 @@ drps
# CHECK-NEXT: 1 3 1.00 fcvt s26, h27
# CHECK-NEXT: 1 3 1.00 fcvt d28, h29
# CHECK-NEXT: 1 3 0.50 fmul s20, s19, s17
-# CHECK-NEXT: 1 10 1.00 fdiv s1, s2, s3
+# CHECK-NEXT: 10 10 10.00 fdiv s1, s2, s3
# CHECK-NEXT: 1 2 0.50 fadd s4, s5, s6
# CHECK-NEXT: 1 2 0.50 fsub s7, s8, s9
# CHECK-NEXT: 1 2 0.50 fmax s10, s11, s12
@@ -1926,7 +1926,7 @@ drps
# CHECK-NEXT: 1 2 0.50 fminnm s19, s20, s21
# CHECK-NEXT: 1 3 0.50 fnmul s22, s23, s2
# CHECK-NEXT: 1 3 0.50 fmul d20, d19, d17
-# CHECK-NEXT: 1 15 1.00 fdiv d1, d2, d3
+# CHECK-NEXT: 15 15 15.00 fdiv d1, d2, d3
# CHECK-NEXT: 1 2 0.50 fadd d4, d5, d6
# CHECK-NEXT: 1 2 0.50 fsub d7, d8, d9
# CHECK-NEXT: 1 2 0.50 fmax d10, d11, d12
@@ -2557,7 +2557,7 @@ drps
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8]
-# CHECK-NEXT: 11.00 11.00 33.00 33.00 87.33 151.33 151.33 517.00 251.00 162.50 162.50 169.50 85.50
+# CHECK-NEXT: 11.00 11.00 33.00 33.00 87.33 151.33 151.33 517.00 251.00 162.50 162.50 215.50 85.50
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions:
@@ -3075,7 +3075,7 @@ drps
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmov s0, s1
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fabs s2, s3
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fneg s4, s5
-# CHECK-NEXT: - - - - - - - - - - - 1.00 - fsqrt s6, s7
+# CHECK-NEXT: - - - - - - - - - - - 9.00 - fsqrt s6, s7
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt d8, s9
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt h10, s11
# CHECK-NEXT: - - - - - - - - - - - 1.00 - frintn s12, s13
@@ -3088,7 +3088,7 @@ drps
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmov d0, d1
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fabs d2, d3
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fneg d4, d5
-# CHECK-NEXT: - - - - - - - - - - - 1.00 - fsqrt d6, d7
+# CHECK-NEXT: - - - - - - - - - - - 16.00 - fsqrt d6, d7
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt s8, d9
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt h10, d11
# CHECK-NEXT: - - - - - - - - - - - 1.00 - frintn d12, d13
@@ -3101,7 +3101,7 @@ drps
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt s26, h27
# CHECK-NEXT: - - - - - - - - - - - 1.00 - fcvt d28, h29
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmul s20, s19, s17
-# CHECK-NEXT: - - - - - - - - - - - 1.00 - fdiv s1, s2, s3
+# CHECK-NEXT: - - - - - - - - - - - 10.00 - fdiv s1, s2, s3
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fadd s4, s5, s6
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fsub s7, s8, s9
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmax s10, s11, s12
@@ -3110,7 +3110,7 @@ drps
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fminnm s19, s20, s21
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fnmul s22, s23, s2
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmul d20, d19, d17
-# CHECK-NEXT: - - - - - - - - - - - 1.00 - fdiv d1, d2, d3
+# CHECK-NEXT: - - - - - - - - - - - 15.00 - fdiv d1, d2, d3
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fadd d4, d5, d6
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fsub d7, d8, d9
# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 fmax d10, d11, d12
``````````
</details>
https://github.com/llvm/llvm-project/pull/119206
More information about the llvm-commits mailing list