[llvm] [AArch64] Fix throughput of 64-bit SVE gather loads (PR #168572)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 09:29:59 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Asher Dobrescu (Asher8118)
<details>
<summary>Changes</summary>
In the Neoverse N3 Software Optimisation Guide, the SVE instruction classes "non-temporal gather load, vector + scalar, 64-bit element size" and "gather load, vector + imm, 64-bit element size" both show a throughput of 4/5. However, the scheduling model currently reports it as 2/3. This patch adds a new resource so that the correct throughput is shown.
---
Patch is 1.63 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168572.diff
5 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td (+11-4)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-basic-instructions.s (+1178-1174)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-mte-instructions.s (+121-117)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-neon-instructions.s (+1083-1079)
- (modified) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s (+3546-3542)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index c73f60a1a7741b..13f8c1be0a9dd7 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -40,6 +40,7 @@ def N3UnitM0 : ProcResource<1>; // Integer Single/Multi-Cycle 0
def N3UnitM1 : ProcResource<1>; // Integer Single/Multi-Cycle 1
def N3UnitL01 : ProcResource<2>; // Load/Store 0/1
def N3UnitL2 : ProcResource<1>; // Load 2
+def N3UnitGL : ProcResource<4>; // Gather Load
def N3UnitD : ProcResource<2>; // Integer Store data 0/1
def N3UnitV0 : ProcResource<1>; // FP/ASIMD 0
def N3UnitV1 : ProcResource<1>; // FP/ASIMD 1
@@ -160,6 +161,12 @@ def N3Write_6c_2L : SchedWriteRes<[N3UnitL, N3UnitL]> {
let NumMicroOps = 2;
}
+def N3Write_6c_2GL : SchedWriteRes<[N3UnitL, N3UnitGL]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ReleaseAtCycles = [3, 5];
+}
+
def N3Write_2c_1L01_1V : SchedWriteRes<[N3UnitL01, N3UnitV]> {
let Latency = 2;
let NumMicroOps = 2;
@@ -2243,8 +2250,8 @@ def : InstRW<[N3Write_7c_4L], (instregex "^LDNT1[BHW]_ZZR_S$",
"^LDNT1S[BH]_ZZR_S$")>;
// Non temporal gather load, vector + scalar 64-bit element size
-def : InstRW<[N3Write_6c_2L], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
-def : InstRW<[N3Write_6c_2L], (instrs LDNT1D_ZZR_D)>;
+def : InstRW<[N3Write_6c_2GL], (instregex "^LDNT1S?[BHW]_ZZR_D$")>;
+def : InstRW<[N3Write_6c_2GL], (instrs LDNT1D_ZZR_D)>;
// Contiguous first faulting load, scalar + scalar
def : InstRW<[N3Write_6c_1L], (instregex "^LDFF1[BHWD]$",
@@ -2293,11 +2300,11 @@ def : InstRW<[N3Write_7c_4L], (instregex "^GLD(FF)?1S?[BH]_S_IMM$",
"^GLD(FF)?1W_IMM$")>;
// Gather load, vector + imm, 64-bit element size
-def : InstRW<[N3Write_6c_2L], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
+def : InstRW<[N3Write_6c_2GL], (instregex "^GLD(FF)?1S?[BHW]_D_IMM$",
"^GLD(FF)?1D_IMM$")>;
// Gather load, 64-bit element size
-def : InstRW<[N3Write_6c_2L],
+def : InstRW<[N3Write_6c_2GL],
(instregex "^GLD(FF)?1S?[BHW]_D_[SU]XTW(_SCALED)?$",
"^GLD(FF)?1S?[BHW]_D(_SCALED)?$",
"^GLD(FF)?1D_[SU]XTW(_SCALED)?$",
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-basic-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-basic-instructions.s
index b9758280e2491e..1767d15d862ad6 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-basic-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-basic-instructions.s
@@ -2545,1181 +2545,1185 @@ drps
# CHECK-NEXT: [0.1] - N3UnitB
# CHECK-NEXT: [1.0] - N3UnitD
# CHECK-NEXT: [1.1] - N3UnitD
-# CHECK-NEXT: [2] - N3UnitL2
-# CHECK-NEXT: [3.0] - N3UnitL01
-# CHECK-NEXT: [3.1] - N3UnitL01
-# CHECK-NEXT: [4] - N3UnitM0
-# CHECK-NEXT: [5] - N3UnitM1
-# CHECK-NEXT: [6.0] - N3UnitS
-# CHECK-NEXT: [6.1] - N3UnitS
-# CHECK-NEXT: [7] - N3UnitV0
-# CHECK-NEXT: [8] - N3UnitV1
+# CHECK-NEXT: [2.0] - N3UnitGL
+# CHECK-NEXT: [2.1] - N3UnitGL
+# CHECK-NEXT: [2.2] - N3UnitGL
+# CHECK-NEXT: [2.3] - N3UnitGL
+# CHECK-NEXT: [3] - N3UnitL2
+# CHECK-NEXT: [4.0] - N3UnitL01
+# CHECK-NEXT: [4.1] - N3UnitL01
+# CHECK-NEXT: [5] - N3UnitM0
+# CHECK-NEXT: [6] - N3UnitM1
+# CHECK-NEXT: [7.0] - N3UnitS
+# CHECK-NEXT: [7.1] - N3UnitS
+# CHECK-NEXT: [8] - N3UnitV0
+# CHECK-NEXT: [9] - N3UnitV1
# CHECK: Resource pressure per iteration:
-# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8]
-# CHECK-NEXT: 11.00 11.00 33.00 33.00 99.33 163.33 163.33 357.75 212.75 156.25 156.25 184.50 64.50
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [2.3] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9]
+# CHECK-NEXT: 11.00 11.00 33.00 33.00 - - - - 99.33 163.33 163.33 357.75 212.75 156.25 156.25 184.50 64.50
# CHECK: Resource pressure by instruction:
-# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions:
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w2, w3, #4095
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w30, w29, #1, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w13, w5, #4095, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x5, x7, #1638
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w20, wsp, #801
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add wsp, wsp, #1104
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add wsp, w30, #4084
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x0, x24, #291
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x3, x24, #4095, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x8, sp, #1074
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add sp, x29, #3816
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub w0, wsp, #4077
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub w4, w20, #546, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub sp, sp, #288
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub wsp, w19, #16
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds w13, w23, #291, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn w2, #4095
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds w20, wsp, #0
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn x3, #1, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmp sp, #20, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmp x30, #4095
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs x4, sp, #3822
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn w3, #291, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn wsp, #1365
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn sp, #1092, lsl #12
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - mov sp, x30
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - mov wsp, w20
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - mov x11, sp
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - mov w24, wsp
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w3, w5, w7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add wzr, w3, w5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w20, wzr, w4
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w4, w6, wzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add w11, w13, w15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w9, w3, wzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w17, w29, w20, lsl #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w21, w22, w23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w24, w25, w26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w27, w28, w29, lsr #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w2, w3, w4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w5, w6, w7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add w8, w9, w10, asr #31
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x3, x5, x7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add xzr, x3, x5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x20, xzr, x4
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x4, x6, xzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - add x11, x13, x15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x9, x3, xzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x17, x29, x20, lsl #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x21, x22, x23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x24, x25, x26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x27, x28, x29, lsr #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x2, x3, x4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x5, x6, x7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - add x8, x9, x10, asr #63
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds w3, w5, w7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn w3, w5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds w20, wzr, w4
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds w4, w6, wzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds w11, w13, w15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w9, w3, wzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w17, w29, w20, lsl #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w21, w22, w23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w24, w25, w26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w27, w28, w29, lsr #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w2, w3, w4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w5, w6, w7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds w8, w9, w10, asr #31
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds x3, x5, x7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmn x3, x5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds x20, xzr, x4
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds x4, x6, xzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - adds x11, x13, x15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x9, x3, xzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x17, x29, x20, lsl #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x21, x22, x23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x24, x25, x26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x27, x28, x29, lsr #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x2, x3, x4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x5, x6, x7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - adds x8, x9, x10, asr #63
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub w3, w5, w7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub wzr, w3, w5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub w4, w6, wzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub w11, w13, w15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w9, w3, wzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w17, w29, w20, lsl #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w21, w22, w23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w24, w25, w26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w27, w28, w29, lsr #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w2, w3, w4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w5, w6, w7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub w8, w9, w10, asr #31
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub x3, x5, x7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub xzr, x3, x5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub x4, x6, xzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - sub x11, x13, x15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x9, x3, xzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x17, x29, x20, lsl #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x21, x22, x23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x24, x25, x26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x27, x28, x29, lsr #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x2, x3, x4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x5, x6, x7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - sub x8, x9, x10, asr #63
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs w3, w5, w7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmp w3, w5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs w4, w6, wzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs w11, w13, w15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w9, w3, wzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w17, w29, w20, lsl #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w21, w22, w23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w24, w25, w26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w27, w28, w29, lsr #31
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w2, w3, w4, asr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w5, w6, w7, asr #21
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs w8, w9, w10, asr #31
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs x3, x5, x7
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - cmp x3, x5
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs x4, x6, xzr
-# CHECK-NEXT: - - - - - - - 0.25 0.25 0.25 0.25 - - subs x11, x13, x15
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs x9, x3, xzr, lsl #10
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs x17, x29, x20, lsl #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs x21, x22, x23, lsr #0
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs x24, x25, x26, lsr #18
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs x27, x28, x29, lsr #63
-# CHECK-NEXT: - - - - - - - 0.50 0.50 - - - - subs x2, x3, x4, asr #0
-# CHECK-NEXT: - - -...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/168572
More information about the llvm-commits
mailing list