[llvm] [AArch64][SelectionDAG] Enable new partial reduction lowering by default (PR #143565)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 09:03:49 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Nicholas Guy (NickGuy-Arm)
Changes
---
Patch is 193.16 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143565.diff
4 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+29-51)
- (modified) llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll (+919-580)
- (modified) llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll (+995-931)
- (modified) llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll (+59-231)
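
For context on what is being enabled by default: the lowering operates on the `llvm.experimental.vector.partial.reduce.add` intrinsic via the new PARTIAL_REDUCE_SMLA/PARTIAL_REDUCE_UMLA ISD nodes. As a minimal sketch (not part of this patch; the intrinsic's mangling suffixes are assumed from the usual overload convention), the `udot` pattern exercised by the NEON test below boils down to IR of this shape:

```llvm
; Sketch: a zext-mul feeding a partial reduction from <16 x i32> down to
; <4 x i32>. With +dotprod, the default lowering can now select this to a
; single udot instruction.
define <4 x i32> @udot_sketch(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %red = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %red
}
```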
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 766599d567efd..e610a63598a18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -153,13 +153,6 @@ cl::opt<bool> EnableSVEGISel(
cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
cl::init(false));
-// FIXME : This is a temporary flag, and is used to help transition to
-// performing lowering the proper way using the new PARTIAL_REDUCE_MLA ISD
-// nodes.
-static cl::opt<bool> EnablePartialReduceNodes(
- "aarch64-enable-partial-reduce-nodes", cl::init(false), cl::ReallyHidden,
- cl::desc("Use the new method of lowering partial reductions."));
-
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
@@ -1457,7 +1450,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::FADD, VT, Custom);
- if (EnablePartialReduceNodes && Subtarget->hasDotProd()) {
+ if (Subtarget->hasDotProd()) {
static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
ISD::PARTIAL_REDUCE_UMLA};
@@ -1895,14 +1888,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
// Handle partial reduction operations
- if (EnablePartialReduceNodes && Subtarget->isSVEorStreamingSVEAvailable()) {
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
// Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
// Other pairs will default to 'Expand'.
static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
ISD::PARTIAL_REDUCE_UMLA};
setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
-
setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
if (Subtarget->hasMatMulInt8()) {
@@ -1957,17 +1949,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
Custom);
- if (EnablePartialReduceNodes) {
- static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
- ISD::PARTIAL_REDUCE_UMLA};
- // Must be lowered to SVE instructions.
- setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
- setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
- setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
- setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
- setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
- setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
- }
+ static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+ ISD::PARTIAL_REDUCE_UMLA};
+ // Must be lowered to SVE instructions.
+ setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
+ setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
+ setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
+ setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
+ setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
+ setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
}
}
@@ -2165,16 +2155,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
assert(I->getIntrinsicID() ==
Intrinsic::experimental_vector_partial_reduce_add &&
"Unexpected intrinsic!");
- if (EnablePartialReduceNodes)
- return true;
-
- EVT VT = EVT::getEVT(I->getType());
- auto Op1 = I->getOperand(1);
- EVT Op1VT = EVT::getEVT(Op1->getType());
- if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
- (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
- VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
- return false;
return true;
}
@@ -2252,26 +2232,24 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
- if (EnablePartialReduceNodes) {
- static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
- ISD::PARTIAL_REDUCE_UMLA};
- unsigned NumElts = VT.getVectorNumElements();
- if (VT.getVectorElementType() == MVT::i64) {
- setPartialReduceMLAAction(MLAOps, VT,
- MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
- setPartialReduceMLAAction(
- MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
- setPartialReduceMLAAction(
- MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
- } else if (VT.getVectorElementType() == MVT::i32) {
- setPartialReduceMLAAction(MLAOps, VT,
- MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
- setPartialReduceMLAAction(
- MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
- } else if (VT.getVectorElementType() == MVT::i16) {
- setPartialReduceMLAAction(MLAOps, VT,
- MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
- }
+ static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+ ISD::PARTIAL_REDUCE_UMLA};
+ unsigned NumElts = VT.getVectorNumElements();
+ if (VT.getVectorElementType() == MVT::i64) {
+ setPartialReduceMLAAction(MLAOps, VT,
+ MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
+ setPartialReduceMLAAction(
+ MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
+ setPartialReduceMLAAction(
+ MLAOps, VT, MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
+ } else if (VT.getVectorElementType() == MVT::i32) {
+ setPartialReduceMLAAction(MLAOps, VT,
+ MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
+ setPartialReduceMLAAction(
+ MLAOps, VT, MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
+ } else if (VT.getVectorElementType() == MVT::i16) {
+ setPartialReduceMLAAction(MLAOps, VT,
+ MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
}
// Lower fixed length vector operations to scalable equivalents.
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 0c7b3c7d3c138..43404d1f871fe 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -1,15 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM
-; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm -aarch64-enable-partial-reduce-nodes < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NEWLOWERING-I8MM
+; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK-NODOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefix=CHECK-DOT
+; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefix=CHECK-DOT-I8MM
define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: udot:
-; CHECK-DOT: // %bb.0:
-; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT: ret
-;
; CHECK-NODOT-LABEL: udot:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
@@ -19,6 +13,16 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT: ret
+;
+; CHECK-DOT-LABEL: udot:
+; CHECK-DOT: // %bb.0:
+; CHECK-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-DOT-I8MM-LABEL: udot:
+; CHECK-DOT-I8MM: // %bb.0:
+; CHECK-DOT-I8MM-NEXT: udot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT: ret
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = zext <16 x i8> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -27,22 +31,6 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
}
define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
-; CHECK-DOT-LABEL: udot_in_loop:
-; CHECK-DOT: // %bb.0: // %entry
-; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
-; CHECK-DOT-NEXT: mov x8, xzr
-; CHECK-DOT-NEXT: .LBB1_1: // %vector.body
-; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-DOT-NEXT: ldr q2, [x0, x8]
-; CHECK-DOT-NEXT: ldr q3, [x1, x8]
-; CHECK-DOT-NEXT: mov v0.16b, v1.16b
-; CHECK-DOT-NEXT: add x8, x8, #16
-; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b
-; CHECK-DOT-NEXT: cmp x8, #16
-; CHECK-DOT-NEXT: b.ne .LBB1_1
-; CHECK-DOT-NEXT: // %bb.2: // %end
-; CHECK-DOT-NEXT: ret
-;
; CHECK-NODOT-LABEL: udot_in_loop:
; CHECK-NODOT: // %bb.0: // %entry
; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
@@ -63,6 +51,38 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
; CHECK-NODOT-NEXT: b.ne .LBB1_1
; CHECK-NODOT-NEXT: // %bb.2: // %end
; CHECK-NODOT-NEXT: ret
+;
+; CHECK-DOT-LABEL: udot_in_loop:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT: mov x8, xzr
+; CHECK-DOT-NEXT: .LBB1_1: // %vector.body
+; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT: ldr q2, [x0, x8]
+; CHECK-DOT-NEXT: ldr q3, [x1, x8]
+; CHECK-DOT-NEXT: mov v0.16b, v1.16b
+; CHECK-DOT-NEXT: add x8, x8, #16
+; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-NEXT: cmp x8, #16
+; CHECK-DOT-NEXT: b.ne .LBB1_1
+; CHECK-DOT-NEXT: // %bb.2: // %end
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_in_loop:
+; CHECK-DOT-I8MM: // %bb.0: // %entry
+; CHECK-DOT-I8MM-NEXT: movi v1.2d, #0000000000000000
+; CHECK-DOT-I8MM-NEXT: mov x8, xzr
+; CHECK-DOT-I8MM-NEXT: .LBB1_1: // %vector.body
+; CHECK-DOT-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-I8MM-NEXT: ldr q2, [x0, x8]
+; CHECK-DOT-I8MM-NEXT: ldr q3, [x1, x8]
+; CHECK-DOT-I8MM-NEXT: mov v0.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT: add x8, x8, #16
+; CHECK-DOT-I8MM-NEXT: udot v1.4s, v2.16b, v3.16b
+; CHECK-DOT-I8MM-NEXT: cmp x8, #16
+; CHECK-DOT-I8MM-NEXT: b.ne .LBB1_1
+; CHECK-DOT-I8MM-NEXT: // %bb.2: // %end
+; CHECK-DOT-I8MM-NEXT: ret
entry:
br label %vector.body
@@ -86,11 +106,6 @@ end:
}
define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: udot_narrow:
-; CHECK-DOT: // %bb.0:
-; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT: ret
-;
; CHECK-NODOT-LABEL: udot_narrow:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
@@ -105,6 +120,16 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
; CHECK-NODOT-NEXT: ret
+;
+; CHECK-DOT-LABEL: udot_narrow:
+; CHECK-DOT: // %bb.0:
+; CHECK-DOT-NEXT: udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-DOT-I8MM-LABEL: udot_narrow:
+; CHECK-DOT-I8MM: // %bb.0:
+; CHECK-DOT-I8MM-NEXT: udot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT: ret
%u.wide = zext <8 x i8> %u to <8 x i32>
%s.wide = zext <8 x i8> %s to <8 x i32>
%mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -113,11 +138,6 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
}
define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-DOT-LABEL: sdot:
-; CHECK-DOT: // %bb.0:
-; CHECK-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
-; CHECK-DOT-NEXT: ret
-;
; CHECK-NODOT-LABEL: sdot:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
@@ -127,6 +147,16 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT: ret
+;
+; CHECK-DOT-LABEL: sdot:
+; CHECK-DOT: // %bb.0:
+; CHECK-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot:
+; CHECK-DOT-I8MM: // %bb.0:
+; CHECK-DOT-I8MM-NEXT: sdot v0.4s, v2.16b, v1.16b
+; CHECK-DOT-I8MM-NEXT: ret
%u.wide = sext <16 x i8> %u to <16 x i32>
%s.wide = sext <16 x i8> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -135,11 +165,6 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
}
define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
-; CHECK-DOT-LABEL: sdot_narrow:
-; CHECK-DOT: // %bb.0:
-; CHECK-DOT-NEXT: sdot v0.2s, v2.8b, v1.8b
-; CHECK-DOT-NEXT: ret
-;
; CHECK-NODOT-LABEL: sdot_narrow:
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
@@ -154,6 +179,16 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
; CHECK-NODOT-NEXT: ret
+;
+; CHECK-DOT-LABEL: sdot_narrow:
+; CHECK-DOT: // %bb.0:
+; CHECK-DOT-NEXT: sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-NEXT: ret
+;
+; CHECK-DOT-I8MM-LABEL: sdot_narrow:
+; CHECK-DOT-I8MM: // %bb.0:
+; CHECK-DOT-I8MM-NEXT: sdot v0.2s, v2.8b, v1.8b
+; CHECK-DOT-I8MM-NEXT: ret
%u.wide = sext <8 x i8> %u to <8 x i32>
%s.wide = sext <8 x i8> %s to <8 x i32>
%mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
@@ -162,27 +197,34 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
}
define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
-; CHECK-NOI8MM-LABEL: usdot:
-; CHECK-NOI8MM: // %bb.0:
-; CHECK-NOI8MM-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT: ret
+; CHECK-NODOT-LABEL: usdot:
+; CHECK-NODOT: // %bb.0:
+; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT: ret
;
-; CHECK-I8MM-LABEL: usdot:
-; CHECK-I8MM: // %bb.0:
-; CHECK-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b
-; CHECK-I8MM-NEXT: ret
+; CHECK-DOT-LABEL: usdot:
+; CHECK-DOT: // %bb.0:
+; CHECK-DOT-NEXT: ushll v3.8h, v1.8b, #0
+; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT: ushll2 v1.8h, v1.16b, #0
+; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT: smlal v0.4s, v4.4h, v3.4h
+; CHECK-DOT-NEXT: smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-DOT-NEXT: smlal v0.4s, v2.4h, v1.4h
+; CHECK-DOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-DOT-NEXT: ret
;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot:
-; CHECK-NEWLOWERING-I8MM: // %bb.0:
-; CHECK-NEWLOWERING-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b
-; CHECK-NEWLOWERING-I8MM-NEXT: ret
+; CHECK-DOT-I8MM-LABEL: usdot:
+; CHECK-DOT-I8MM: // %bb.0:
+; CHECK-DOT-I8MM-NEXT: usdot v0.4s, v1.16b, v2.16b
+; CHECK-DOT-I8MM-NEXT: ret
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = sext <16 x i8> %s to <16 x i32>
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
@@ -191,60 +233,67 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
}
define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
-; CHECK-NOI8MM-LABEL: usdot_in_loop:
-; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NOI8MM-NEXT: mov x8, xzr
-; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body
-; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8]
-; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT: add x8, x8, #16
-; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT: cmp x8, #16
-; CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h
-; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h
-; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h
-; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h
-; CHECK-NOI8MM-NEXT: b.ne .LBB6_1
-; CHECK-NOI8MM-NEXT: // %bb.2: // %end
-; CHECK-NOI8MM-NEXT: ret
+; CHECK-NODOT-LABEL: usdot_in_loop:
+; CHECK-NODOT: // %bb.0: // %entry
+; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NODOT-NEXT: mov x8, xzr
+; CHECK-NODOT-NEXT: .LBB6_1: // %vector.body
+; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT: ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
+; CHECK-NODOT-NEXT: add x8, x8, #16
+; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: ushll v5.8h, v3.8b, #0
+; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT: cmp x8, #16
+; CHECK-NODOT-NEXT: smlal v1.4s, v4.4h, v5.4h
+; CHECK-NODOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NODOT-NEXT: smlal v1.4s, v2.4h, v3.4h
+; CHECK-NODOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-NODOT-NEXT: b.ne .LBB6_1
+; CHECK-NODOT-NEXT: // %bb.2: // %end
+; CHECK-NODOT-NEXT: ret
;
-; CHECK-I8MM-LABEL: usdot_in_loop:
-; CHECK-I8MM: // %bb.0: // %entry
-; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
-; CHECK-I8MM-NEXT: mov x8, xzr
-; CHECK-I8MM-NEXT: .LBB6_1: // %vector.body
-; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
-; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
-; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
-; CHECK-I8MM-NEXT: add x8, x8, #16
-; CHECK-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b
-; CHECK-I8MM-NEXT: cmp x8, #16
-; CHECK-I8MM-NEXT: b.ne .LBB6_1
-; CHECK-I8MM-NEXT: // %bb.2: // %end
-; CHECK-I8MM-NEXT: ret
+; CHECK-DOT-LABEL: usdot_in_loop:
+; CHECK-DOT: // %bb.0: // %entry
+; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
+; CHECK-DOT-NEXT: mov x8, xzr
+; CHECK-DOT-NEXT: .LBB6_1: // %vector.body
+; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-DOT-NEXT: ldr q2, [x0, x8]
+; CHECK-DOT-NEXT: ldr q3, [x1, x8]
+; CHECK-DOT-NEXT: mov v0.16b, v1.16b
+; CHECK-DOT-NEXT: add x8, x8, #16
+; CHECK-DOT-NEXT: sshll v4.8h, v2.8b, #0
+; CHECK-DOT-NEXT: ushll v5.8h, v3.8b, #0
+; CHECK-DOT-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-DOT-NEXT: ushll2 v3.8h, v3.16b, #0
+; CHECK-DOT-NEXT: cmp x8, #16
+; CHECK-DOT-NEXT: smlal v1.4s, v4.4h, v5.4h
+; CHECK-DOT-NEXT: smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-DOT-NEXT: smlal v1.4s, v2.4h, v3.4h
+; CHECK-DOT-NEXT: smlal2 v1.4s, v2.8h, v3.8h
+; CHECK-DOT-NEXT: b.ne .LBB6_1
+; CHECK-DOT-NEXT: // %bb.2: // %end
+; CHECK-DOT-NEXT: ret
;
-; CHECK-NEWLOWERING-I8MM-LABEL: usdot_in_loop:
-; CHECK-NEWLOWERING-I8MM: // %bb.0: // %entry
-; CHECK-NEWLOWERING-I8MM-NEXT: movi v1.2d, #000...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/143565