[llvm] 35b2317 - [AArch64] Support USDOT in performAddDotCombine (#171864)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Dec 14 21:46:47 PST 2025
Author: David Green
Date: 2025-12-15T05:46:43Z
New Revision: 35b23172c590613d546727f2cc6bb20a651a142b
URL: https://github.com/llvm/llvm-project/commit/35b23172c590613d546727f2cc6bb20a651a142b
DIFF: https://github.com/llvm/llvm-project/commit/35b23172c590613d546727f2cc6bb20a651a142b.diff
LOG: [AArch64] Support USDOT in performAddDotCombine (#171864)
This function performs the fold
// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
which applies equally to USDOT now that there is an AArch64ISD node for it.
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/aarch64-matmul.ll
llvm/test/CodeGen/AArch64/neon-dotreduce.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 426d950f850e6..9145492eb5d71 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21825,7 +21825,8 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
// Handle commutivity
auto isZeroDot = [](SDValue Dot) {
return (Dot.getOpcode() == AArch64ISD::UDOT ||
- Dot.getOpcode() == AArch64ISD::SDOT) &&
+ Dot.getOpcode() == AArch64ISD::SDOT ||
+ Dot.getOpcode() == AArch64ISD::USDOT) &&
isZerosVector(Dot.getOperand(0).getNode());
};
if (!isZeroDot(Dot))
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index e7e9ee7330613..c6776f3dd2513 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s
-; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: smmla.v4i32.v16i8:
@@ -160,6 +160,42 @@ entry:
ret <4 x i32> %vusdot1.i
}
+define <2 x i32> @usdot_add_zero.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: usdot v0.2s, v1.8b, v2.8b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: usdot v3.2s, v1.8b, v2.8b
+; CHECK-GI-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-GI-NEXT: ret
+entry:
+ %x = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %a, <8 x i8> %b)
+ %y = add <2 x i32> %x, %r
+ ret <2 x i32> %y
+}
+
+define <4 x i32> @usdot_add_zero.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: usdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT: ret
+entry:
+ %x = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %a, <16 x i8> %b)
+ %y = add <4 x i32> %x, %r
+ ret <4 x i32> %y
+}
+
declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 4b0d110632959..dbbe00c89eecf 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1375,11 +1375,9 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i
; CHECK-SD-LABEL: test_usdot_v8i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: usdot v5.2s, v0.8b, v1.8b
; CHECK-SD-NEXT: usdot v4.2s, v2.8b, v3.8b
-; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
-; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: usdot v4.2s, v0.8b, v1.8b
+; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
@@ -1416,11 +1414,9 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8
; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: usdot v5.2s, v1.8b, v0.8b
; CHECK-SD-NEXT: usdot v4.2s, v3.8b, v2.8b
-; CHECK-SD-NEXT: add v0.2s, v5.2s, v4.2s
-; CHECK-SD-NEXT: addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT: usdot v4.2s, v1.8b, v0.8b
+; CHECK-SD-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
@@ -1457,11 +1453,9 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1
; CHECK-SD-LABEL: test_usdot_v16i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: usdot v5.4s, v0.16b, v1.16b
; CHECK-SD-NEXT: usdot v4.4s, v2.16b, v3.16b
-; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
-; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: usdot v4.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT: addv s0, v4.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
@@ -1509,11 +1503,9 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b,
; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v0.16b
; CHECK-SD-NEXT: usdot v4.4s, v3.16b, v2.16b
-; CHECK-SD-NEXT: add v0.4s, v5.4s, v4.4s
-; CHECK-SD-NEXT: addv s0, v0.4s
+; CHECK-SD-NEXT: usdot v4.4s, v1.16b, v0.16b
+; CHECK-SD-NEXT: addv s0, v4.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
;
@@ -4384,12 +4376,10 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
; CHECK-SD-LABEL: test_usdot_v32i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT: ldp q2, q3, [x0]
-; CHECK-SD-NEXT: ldp q4, q5, [x1]
-; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
-; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
-; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: ldp q1, q3, [x0]
+; CHECK-SD-NEXT: ldp q2, q4, [x1]
+; CHECK-SD-NEXT: usdot v0.4s, v3.16b, v4.16b
+; CHECK-SD-NEXT: usdot v0.4s, v1.16b, v2.16b
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w2
@@ -4438,15 +4428,11 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
-; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v3.16b
-; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v2.16b
-; CHECK-SD-NEXT: usdot v17.4s, v4.16b, v6.16b
-; CHECK-SD-NEXT: usdot v19.4s, v5.16b, v7.16b
-; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
-; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
-; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT: usdot v17.4s, v1.16b, v3.16b
+; CHECK-SD-NEXT: usdot v16.4s, v5.16b, v7.16b
+; CHECK-SD-NEXT: usdot v17.4s, v0.16b, v2.16b
+; CHECK-SD-NEXT: usdot v16.4s, v4.16b, v6.16b
+; CHECK-SD-NEXT: add v0.4s, v17.4s, v16.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
; CHECK-SD-NEXT: ret
@@ -8781,20 +8767,16 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %
; CHECK-SD-LABEL: test_usdot_v64i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT: ldp q1, q2, [x0, #32]
-; CHECK-SD-NEXT: ldp q6, q7, [x1, #32]
-; CHECK-SD-NEXT: ldp q16, q17, [x0]
-; CHECK-SD-NEXT: ldp q18, q19, [x1]
-; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v7.16b
-; CHECK-SD-NEXT: usdot v5.4s, v1.16b, v6.16b
-; CHECK-SD-NEXT: usdot v4.4s, v17.16b, v19.16b
-; CHECK-SD-NEXT: usdot v3.4s, v16.16b, v18.16b
-; CHECK-SD-NEXT: add v0.4s, v4.4s, v0.4s
-; CHECK-SD-NEXT: add v1.4s, v3.4s, v5.4s
-; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q2, q3, [x0, #32]
+; CHECK-SD-NEXT: ldp q4, q5, [x1, #32]
+; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT: ldp q2, q3, [x0]
+; CHECK-SD-NEXT: ldp q4, q5, [x1]
+; CHECK-SD-NEXT: usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT: usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w8, s0
; CHECK-SD-NEXT: add w0, w8, w2
@@ -8863,32 +8845,24 @@ entry:
define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
; CHECK-SD-LABEL: test_usdot_v64i8_double:
; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v17.2d, #0000000000000000
; CHECK-SD-NEXT: movi v18.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v21.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v22.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v23.2d, #0000000000000000
-; CHECK-SD-NEXT: ldp q16, q17, [sp, #64]
-; CHECK-SD-NEXT: movi v24.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v25.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v26.2d, #0000000000000000
-; CHECK-SD-NEXT: movi v27.2d, #0000000000000000
-; CHECK-SD-NEXT: ldp q19, q20, [sp, #96]
-; CHECK-SD-NEXT: usdot v18.4s, v3.16b, v7.16b
-; CHECK-SD-NEXT: ldp q3, q7, [sp, #32]
-; CHECK-SD-NEXT: usdot v21.4s, v1.16b, v5.16b
-; CHECK-SD-NEXT: ldp q1, q5, [sp]
-; CHECK-SD-NEXT: usdot v22.4s, v2.16b, v6.16b
-; CHECK-SD-NEXT: usdot v23.4s, v0.16b, v4.16b
-; CHECK-SD-NEXT: usdot v24.4s, v7.16b, v20.16b
-; CHECK-SD-NEXT: usdot v27.4s, v3.16b, v19.16b
-; CHECK-SD-NEXT: usdot v26.4s, v5.16b, v17.16b
-; CHECK-SD-NEXT: usdot v25.4s, v1.16b, v16.16b
-; CHECK-SD-NEXT: add v0.4s, v21.4s, v18.4s
-; CHECK-SD-NEXT: add v1.4s, v23.4s, v22.4s
-; CHECK-SD-NEXT: add v2.4s, v26.4s, v24.4s
-; CHECK-SD-NEXT: add v3.4s, v25.4s, v27.4s
-; CHECK-SD-NEXT: add v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT: add v1.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT: movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT: ldp q20, q21, [sp, #96]
+; CHECK-SD-NEXT: ldp q22, q23, [sp, #32]
+; CHECK-SD-NEXT: usdot v16.4s, v3.16b, v7.16b
+; CHECK-SD-NEXT: usdot v18.4s, v2.16b, v6.16b
+; CHECK-SD-NEXT: usdot v19.4s, v23.16b, v21.16b
+; CHECK-SD-NEXT: usdot v17.4s, v22.16b, v20.16b
+; CHECK-SD-NEXT: ldp q2, q3, [sp, #64]
+; CHECK-SD-NEXT: ldp q6, q7, [sp]
+; CHECK-SD-NEXT: usdot v16.4s, v1.16b, v5.16b
+; CHECK-SD-NEXT: usdot v18.4s, v0.16b, v4.16b
+; CHECK-SD-NEXT: usdot v19.4s, v7.16b, v3.16b
+; CHECK-SD-NEXT: usdot v17.4s, v6.16b, v2.16b
+; CHECK-SD-NEXT: add v0.4s, v18.4s, v16.4s
+; CHECK-SD-NEXT: add v1.4s, v17.4s, v19.4s
; CHECK-SD-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT: addv s0, v0.4s
; CHECK-SD-NEXT: fmov w0, s0
More information about the llvm-commits
mailing list