[llvm] [AArch64] Add USDOT to the instructions handled by performAddDotCombine. (PR #171864)

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 11 09:13:49 PST 2025


https://github.com/davemgreen updated https://github.com/llvm/llvm-project/pull/171864

>From 1df2384f5aed2114e05a30a4251353f137d922fb Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 11 Dec 2025 16:23:35 +0000
Subject: [PATCH 1/3] [AArch64] Add tests for add(usdot(zero), a). NFC

---
 llvm/test/CodeGen/AArch64/aarch64-matmul.ll | 33 +++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index e7e9ee7330613..bdb64ef699f34 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm    < %s | FileCheck %s
-; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm    < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+i8mm -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: smmla.v4i32.v16i8:
@@ -160,8 +160,37 @@ entry:
   ret <4 x i32> %vusdot1.i
 }
 
+define <2 x i32> @usdot_add_zero.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    usdot v3.2s, v1.8b, v2.8b
+; CHECK-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NEXT:    ret
+entry:
+  %x = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %a, <8 x i8> %b)
+  %y = add <2 x i32> %x, %r
+  ret <2 x i32> %y
+}
+
+define <4 x i32> @usdot_add_zero.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    usdot v3.4s, v1.16b, v2.16b
+; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %x = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %a, <16 x i8> %b)
+  %y = add <4 x i32> %x, %r
+  ret <4 x i32> %y
+}
+
 declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-GI: {{.*}}
+; CHECK-SD: {{.*}}

>From 9241ad0fbfaade9c9006bb46aa127a709734ed59 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 11 Dec 2025 16:30:05 +0000
Subject: [PATCH 2/3] [AArch64] Add USDOT to the instructions handled by
 performAddDotCombine.

This function performs the combine
// ADD(UDOT(zero, x, y), A) -->  UDOT(A, x, y)

which applies equally to USDOT now that we have a node for it.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  3 +-
 llvm/test/CodeGen/AArch64/aarch64-matmul.ll   | 37 +++++++++++--------
 2 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 41caa817c11a4..35d40eb4e6e3f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21813,7 +21813,8 @@ static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
   // Handle commutivity
   auto isZeroDot = [](SDValue Dot) {
     return (Dot.getOpcode() == AArch64ISD::UDOT ||
-            Dot.getOpcode() == AArch64ISD::SDOT) &&
+            Dot.getOpcode() == AArch64ISD::SDOT ||
+            Dot.getOpcode() == AArch64ISD::USDOT) &&
            isZerosVector(Dot.getOperand(0).getNode());
   };
   if (!isZeroDot(Dot))
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
index bdb64ef699f34..c6776f3dd2513 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -161,12 +161,17 @@ entry:
 }
 
 define <2 x i32> @usdot_add_zero.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
-; CHECK-LABEL: usdot_add_zero.v2i32.v8i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    usdot v3.2s, v1.8b, v2.8b
-; CHECK-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usdot v0.2s, v1.8b, v2.8b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usdot_add_zero.v2i32.v8i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT:    usdot v3.2s, v1.8b, v2.8b
+; CHECK-GI-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-GI-NEXT:    ret
 entry:
   %x = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %a, <8 x i8> %b)
   %y = add <2 x i32> %x, %r
@@ -174,12 +179,17 @@ entry:
 }
 
 define <4 x i32> @usdot_add_zero.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
-; CHECK-LABEL: usdot_add_zero.v4i32.v16i8:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-NEXT:    usdot v3.4s, v1.16b, v2.16b
-; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: usdot_add_zero.v4i32.v16i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT:    usdot v3.4s, v1.16b, v2.16b
+; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-GI-NEXT:    ret
 entry:
   %x = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %a, <16 x i8> %b)
   %y = add <4 x i32> %x, %r
@@ -191,6 +201,3 @@ declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16
 declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
 declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}

>From 68a8cc92672893973c6363ecadf17923da47d518 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Thu, 11 Dec 2025 17:11:40 +0000
Subject: [PATCH 3/3] Update neon-dotreduce.ll

---
 llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 114 ++++++++------------
 1 file changed, 44 insertions(+), 70 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
index 4b0d110632959..dbbe00c89eecf 100644
--- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
+++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll
@@ -1375,11 +1375,9 @@ define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i
 ; CHECK-SD-LABEL: test_usdot_v8i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.2s, v0.8b, v1.8b
 ; CHECK-SD-NEXT:    usdot v4.2s, v2.8b, v3.8b
-; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
-; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    usdot v4.2s, v0.8b, v1.8b
+; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1416,11 +1414,9 @@ define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8
 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.2s, v1.8b, v0.8b
 ; CHECK-SD-NEXT:    usdot v4.2s, v3.8b, v2.8b
-; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
-; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
+; CHECK-SD-NEXT:    usdot v4.2s, v1.8b, v0.8b
+; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1457,11 +1453,9 @@ define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <1
 ; CHECK-SD-LABEL: test_usdot_v16i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.4s, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    usdot v4.4s, v2.16b, v3.16b
-; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    usdot v4.4s, v0.16b, v1.16b
+; CHECK-SD-NEXT:    addv s0, v4.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -1509,11 +1503,9 @@ define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b,
 ; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v0.16b
 ; CHECK-SD-NEXT:    usdot v4.4s, v3.16b, v2.16b
-; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
-; CHECK-SD-NEXT:    addv s0, v0.4s
+; CHECK-SD-NEXT:    usdot v4.4s, v1.16b, v0.16b
+; CHECK-SD-NEXT:    addv s0, v4.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
 ;
@@ -4384,12 +4376,10 @@ define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ; CHECK-SD-LABEL: test_usdot_v32i8:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q2, q3, [x0]
-; CHECK-SD-NEXT:    ldp q4, q5, [x1]
-; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
-; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
-; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    ldp q1, q3, [x0]
+; CHECK-SD-NEXT:    ldp q2, q4, [x1]
+; CHECK-SD-NEXT:    usdot v0.4s, v3.16b, v4.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w8, s0
 ; CHECK-SD-NEXT:    add w0, w8, w2
@@ -4438,15 +4428,11 @@ define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <3
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
-; CHECK-SD-NEXT:    usdot v16.4s, v1.16b, v3.16b
-; CHECK-SD-NEXT:    usdot v18.4s, v0.16b, v2.16b
-; CHECK-SD-NEXT:    usdot v17.4s, v4.16b, v6.16b
-; CHECK-SD-NEXT:    usdot v19.4s, v5.16b, v7.16b
-; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
-; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
-; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    usdot v17.4s, v1.16b, v3.16b
+; CHECK-SD-NEXT:    usdot v16.4s, v5.16b, v7.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v0.16b, v2.16b
+; CHECK-SD-NEXT:    usdot v16.4s, v4.16b, v6.16b
+; CHECK-SD-NEXT:    add v0.4s, v17.4s, v16.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w0, s0
 ; CHECK-SD-NEXT:    ret
@@ -8781,20 +8767,16 @@ define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %
 ; CHECK-SD-LABEL: test_usdot_v64i8:
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q1, q2, [x0, #32]
-; CHECK-SD-NEXT:    ldp q6, q7, [x1, #32]
-; CHECK-SD-NEXT:    ldp q16, q17, [x0]
-; CHECK-SD-NEXT:    ldp q18, q19, [x1]
-; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v7.16b
-; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v6.16b
-; CHECK-SD-NEXT:    usdot v4.4s, v17.16b, v19.16b
-; CHECK-SD-NEXT:    usdot v3.4s, v16.16b, v18.16b
-; CHECK-SD-NEXT:    add v0.4s, v4.4s, v0.4s
-; CHECK-SD-NEXT:    add v1.4s, v3.4s, v5.4s
-; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q2, q3, [x0, #32]
+; CHECK-SD-NEXT:    ldp q4, q5, [x1, #32]
+; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT:    ldp q2, q3, [x0]
+; CHECK-SD-NEXT:    ldp q4, q5, [x1]
+; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
+; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w8, s0
 ; CHECK-SD-NEXT:    add w0, w8, w2
@@ -8863,32 +8845,24 @@ entry:
 define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; CHECK-SD-LABEL: test_usdot_v64i8_double:
 ; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
 ; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v21.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v22.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v23.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q16, q17, [sp, #64]
-; CHECK-SD-NEXT:    movi v24.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v25.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v26.2d, #0000000000000000
-; CHECK-SD-NEXT:    movi v27.2d, #0000000000000000
-; CHECK-SD-NEXT:    ldp q19, q20, [sp, #96]
-; CHECK-SD-NEXT:    usdot v18.4s, v3.16b, v7.16b
-; CHECK-SD-NEXT:    ldp q3, q7, [sp, #32]
-; CHECK-SD-NEXT:    usdot v21.4s, v1.16b, v5.16b
-; CHECK-SD-NEXT:    ldp q1, q5, [sp]
-; CHECK-SD-NEXT:    usdot v22.4s, v2.16b, v6.16b
-; CHECK-SD-NEXT:    usdot v23.4s, v0.16b, v4.16b
-; CHECK-SD-NEXT:    usdot v24.4s, v7.16b, v20.16b
-; CHECK-SD-NEXT:    usdot v27.4s, v3.16b, v19.16b
-; CHECK-SD-NEXT:    usdot v26.4s, v5.16b, v17.16b
-; CHECK-SD-NEXT:    usdot v25.4s, v1.16b, v16.16b
-; CHECK-SD-NEXT:    add v0.4s, v21.4s, v18.4s
-; CHECK-SD-NEXT:    add v1.4s, v23.4s, v22.4s
-; CHECK-SD-NEXT:    add v2.4s, v26.4s, v24.4s
-; CHECK-SD-NEXT:    add v3.4s, v25.4s, v27.4s
-; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
-; CHECK-SD-NEXT:    add v1.4s, v3.4s, v2.4s
+; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-SD-NEXT:    ldp q20, q21, [sp, #96]
+; CHECK-SD-NEXT:    ldp q22, q23, [sp, #32]
+; CHECK-SD-NEXT:    usdot v16.4s, v3.16b, v7.16b
+; CHECK-SD-NEXT:    usdot v18.4s, v2.16b, v6.16b
+; CHECK-SD-NEXT:    usdot v19.4s, v23.16b, v21.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v22.16b, v20.16b
+; CHECK-SD-NEXT:    ldp q2, q3, [sp, #64]
+; CHECK-SD-NEXT:    ldp q6, q7, [sp]
+; CHECK-SD-NEXT:    usdot v16.4s, v1.16b, v5.16b
+; CHECK-SD-NEXT:    usdot v18.4s, v0.16b, v4.16b
+; CHECK-SD-NEXT:    usdot v19.4s, v7.16b, v3.16b
+; CHECK-SD-NEXT:    usdot v17.4s, v6.16b, v2.16b
+; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
+; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
 ; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-SD-NEXT:    addv s0, v0.4s
 ; CHECK-SD-NEXT:    fmov w0, s0



More information about the llvm-commits mailing list