[llvm] 35b2317 - [AArch64] Support USDOT in performAddDotCombine (#171864)

via llvm-commits llvm-commits at lists.llvm.org
Sun Dec 14 21:46:47 PST 2025


Author: David Green
Date: 2025-12-15T05:46:43Z
New Revision: 35b23172c590613d546727f2cc6bb20a651a142b

URL: https://github.com/llvm/llvm-project/commit/35b23172c590613d546727f2cc6bb20a651a142b
DIFF: https://github.com/llvm/llvm-project/commit/35b23172c590613d546727f2cc6bb20a651a142b.diff

LOG: [AArch64] Support USDOT in performAddDotCombine (#171864)

This function performs the fold
// ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)

which applies equally to USDOT now that we have a node for it.

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/test/CodeGen/AArch64/aarch64-matmul.ll
    llvm/test/CodeGen/AArch64/neon-dotreduce.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 426d950f850e6..9145492eb5d71 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21825,7 +21825,8 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
   // Handle commutivity
   auto isZeroDot = [](SDValue Dot) {
     return (Dot.getOpcode() == AArch64ISD::UDOT ||
-            Dot.getOpcode() == AArch64ISD::SDOT) &&
+            Dot.getOpcode() == AArch64ISD::SDOT ||
+            Dot.getOpcode() == AArch64ISD::USDOT) &&
            isZerosVector(Dot.getOperand(0).getNode());
   };
   if (!isZeroDot(Dot))

diff  --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index e7e9ee7330613..c6776f3dd2513 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm    < %s | FileCheck %s
-; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm    < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: smmla.v4i32.v16i8:
@@ -160,6 +160,42 @@ entry:
   ret <4 x i32> %vusdot1.i
 }
 
+define <2 x i32> @usdot_add_zero.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-SD-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT:    usdot v3.2s, v1.8b, v2.8b
+; CHECK-GI-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-GI-NEXT:    ret
+entry:
+  %x = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %a, <8 x i8> %b)
+  %y = add <2 x i32> %x, %r
+  ret <2 x i32> %y
+}
+
+define <4 x i32> @usdot_add_zero.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-SD-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT:    usdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    ret
+entry:
+  %x = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %a, <16 x i8> %b)
+  %y = add <4 x i32> %x, %r
+  ret <4 x i32> %y
+}
+
 declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2

diff  --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 4b0d110632959..dbbe00c89eecf 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1375,11 +1375,9 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i
 ; CHECK-SD-LABEL: test_usdot_v8i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.2s, v0.8b, v1.8b
 ; CHECK-SD-NEXT:    usdot v4.2s, v2.8b, v3.8b
-; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
-; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    usdot v4.2s, v0.8b, v1.8b
+; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1416,11 +1414,9 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8
 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.2s, v1.8b, v0.8b
 ; CHECK-SD-NEXT:    usdot v4.2s, v3.8b, v2.8b
-; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
-; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    usdot v4.2s, v1.8b, v0.8b
+; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1457,11 +1453,9 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1
 ; CHECK-SD-LABEL: test_usdot_v16i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.4s, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    usdot v4.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT:    addv s0, v4.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1509,11 +1503,9 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b,
 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v0.16b
 ; CHECK-SD-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    usdot v4.4s, v1.16b, v0.16b
+; CHECK-SD-NEXT:    addv s0, v4.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -4384,12 +4376,10 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ; CHECK-SD-LABEL: test_usdot_v32i8:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q2, q3, [x0]
-; CHECK-SD-NEXT:    ldp q4, q5, [x1]
-; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
-; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
-; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ldp q1, q3, [x0]
+; CHECK-SD-NEXT:    ldp q2, q4, [x1]
+; CHECK-SD-NEXT:    usdot v0.4s, v3.16b, v4.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w8, s0
 ; CHECK-SD-NEXT:    add w0, w8, w2
@@ -4438,15 +4428,11 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v16.4s, v1.16b, v3.16b
-; CHECK-SD-NEXT:    usdot v18.4s, v0.16b, v2.16b
-; CHECK-SD-NEXT:    usdot v17.4s, v4.16b, v6.16b
-; CHECK-SD-NEXT:    usdot v19.4s, v5.16b, v7.16b
-; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
-; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
-; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usdot v17.4s, v1.16b, v3.16b
+; CHECK-SD-NEXT:    usdot v16.4s, v5.16b, v7.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v0.16b, v2.16b
+; CHECK-SD-NEXT:    usdot v16.4s, v4.16b, v6.16b
+; CHECK-SD-NEXT:    add v0.4s, v17.4s, v16.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
@@ -8781,20 +8767,16 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ; CHECK-SD-LABEL: test_usdot_v64i8:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q1, q2, [x0, #32]
-; CHECK-SD-NEXT:    ldp q6, q7, [x1, #32]
-; CHECK-SD-NEXT:    ldp q16, q17, [x0]
-; CHECK-SD-NEXT:    ldp q18, q19, [x1]
-; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v7.16b
-; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v6.16b
-; CHECK-SD-NEXT:    usdot v4.4s, v17.16b, v19.16b
-; CHECK-SD-NEXT:    usdot v3.4s, v16.16b, v18.16b
-; CHECK-SD-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-SD-NEXT:    add v1.4s, v3.4s, v5.4s
-; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-SD-NEXT:    ldp q4, q5, [x1, #32]
+; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT:    ldp q2, q3, [x0]
+; CHECK-SD-NEXT:    ldp q4, q5, [x1]
+; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w8, s0
 ; CHECK-SD-NEXT:    add w0, w8, w2
@@ -8863,32 +8845,24 @@ entry:
 define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; CHECK-SD-LABEL: test_usdot_v64i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v21.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v22.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v23.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q16, q17, [sp, #64]
-; CHECK-SD-NEXT:    movi v24.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v26.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v27.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q19, q20, [sp, #96]
-; CHECK-SD-NEXT:    usdot v18.4s, v3.16b, v7.16b
-; CHECK-SD-NEXT:    ldp q3, q7, [sp, #32]
-; CHECK-SD-NEXT:    usdot v21.4s, v1.16b, v5.16b
-; CHECK-SD-NEXT:    ldp q1, q5, [sp]
-; CHECK-SD-NEXT:    usdot v22.4s, v2.16b, v6.16b
-; CHECK-SD-NEXT:    usdot v23.4s, v0.16b, v4.16b
-; CHECK-SD-NEXT:    usdot v24.4s, v7.16b, v20.16b
-; CHECK-SD-NEXT:    usdot v27.4s, v3.16b, v19.16b
-; CHECK-SD-NEXT:    usdot v26.4s, v5.16b, v17.16b
-; CHECK-SD-NEXT:    usdot v25.4s, v1.16b, v16.16b
-; CHECK-SD-NEXT:    add v0.4s, v21.4s, v18.4s
-; CHECK-SD-NEXT:    add v1.4s, v23.4s, v22.4s
-; CHECK-SD-NEXT:    add v2.4s, v26.4s, v24.4s
-; CHECK-SD-NEXT:    add v3.4s, v25.4s, v27.4s
-; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT:    add v1.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q20, q21, [sp, #96]
+; CHECK-SD-NEXT:    ldp q22, q23, [sp, #32]
+; CHECK-SD-NEXT:    usdot v16.4s, v3.16b, v7.16b
+; CHECK-SD-NEXT:    usdot v18.4s, v2.16b, v6.16b
+; CHECK-SD-NEXT:    usdot v19.4s, v23.16b, v21.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v22.16b, v20.16b
+; CHECK-SD-NEXT:    ldp q2, q3, [sp, #64]
+; CHECK-SD-NEXT:    ldp q6, q7, [sp]
+; CHECK-SD-NEXT:    usdot v16.4s, v1.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v18.4s, v0.16b, v4.16b
+; CHECK-SD-NEXT:    usdot v19.4s, v7.16b, v3.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v6.16b, v2.16b
+; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
+; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
 ; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w0, s0


        


More information about the llvm-commits mailing list