[llvm] [SelectionDAG] Improve type legalisation for PARTIAL_REDUCE_MLA (PR #130935)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 30 07:12:32 PDT 2025
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/130935
>From a27a811233d6248a95d830c4ea6b6370c1305d7b Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 28 Feb 2025 17:10:50 +0000
Subject: [PATCH 01/10] [SelectionDAG] Improve type legalisation for
PARTIAL_REDUCE_MLA
Implement proper splitting functions for PARTIAL_REDUCE_MLA ISD
nodes.
This makes the udot_8to64 and sdot_8to64 tests generate dot product
instructions when the new ISD nodes are used.
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 +++++
.../SelectionDAG/LegalizeVectorTypes.cpp | 26 ++++++++++++++++---
.../AArch64/sve-partial-reduce-dot-product.ll | 4 +++
3 files changed, 33 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index abe261728a3e6..7b0e15f951681 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1668,6 +1668,12 @@ class TargetLoweringBase {
return Action == Legal || Action == Custom;
}
+ /// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
+ /// legal for this target.
+ bool isPartialReduceMLALegal(EVT AccVT, EVT InputVT) const {
+ return getPartialReduceMLAAction(AccVT, InputVT) == Legal;
+ }
+
/// If the action for this operation is to promote, this method returns the
/// ValueType to promote to.
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a01e1cff74564..d0ae436a8758f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3220,8 +3220,26 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDLoc DL(N);
- SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG);
- std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL);
+ SDValue Acc = N->getOperand(0);
+ SDValue Input1 = N->getOperand(1);
+
+ // If the node has not gone through the DAG combine, then do not attempt to
+ // legalise, just expand.
+ if (!TLI.isPartialReduceMLALegal(Acc.getValueType(), Input1.getValueType())) {
+ SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG);
+ std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL);
+ return;
+ }
+
+ SDValue AccLo, AccHi, Input1Lo, Input1Hi, Input2Lo, Input2Hi;
+ std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL);
+ std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
+ std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL);
+ unsigned Opcode = N->getOpcode();
+ EVT ResultVT = AccLo.getValueType();
+
+ Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo);
+ Hi = DAG.getNode(Opcode, DL, ResultVT, AccHi, Input1Hi, Input2Hi);
}
void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) {
@@ -4501,7 +4519,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
}
SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) {
- return TLI.expandPartialReduceMLA(N, DAG);
+ SDValue Lo, Hi;
+ SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index ed27f40aba774..71936b686be15 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -259,6 +259,8 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: udot z0.d, z5.h, z4.h
+; CHECK-NEWLOWERING-NEXT: udot z1.d, z2.h, z3.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -293,6 +295,8 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sdot z0.d, z5.h, z4.h
+; CHECK-NEWLOWERING-NEXT: sdot z1.d, z2.h, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
>From 4ee499082d5f1058dc192eb5eabee7662ad7a866 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 15 Apr 2025 13:12:49 +0100
Subject: [PATCH 02/10] Explicitly set PartialReduceMLAActions
---
.../Target/AArch64/AArch64ISelLowering.cpp | 28 +++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 447794cc2b744..810d42635e7b2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1604,6 +1604,26 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
+ if (EnablePartialReduceNodes) {
+ for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
+ for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
+ // 1. Set all combinations where a type is illegal to "Legal"
+ // - These will be legalized to a legal type pair
+ // - Avoid expanding them too early (or preventing folds)
+ if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) {
+ setPartialReduceMLAAction(VT, InnerVT, Legal);
+ continue;
+ }
+ // 2. Set all legal combinations to "Expand"
+ // - Not all of these can be lowered (via a Legal or Custom lowering).
+ setPartialReduceMLAAction(VT, InnerVT, Expand);
+ }
+ }
+ // 3. Mark known legal pairs as 'Legal' (these will expand to USDOT).
+ setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
+ setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
+ }
+
// Firstly, exclude all scalable vector extending loads/truncating stores,
// include both integer and floating scalable vector.
for (MVT VT : MVT::scalable_vector_valuetypes()) {
@@ -1856,6 +1876,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Other pairs will default to 'Expand'.
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
+
+ setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i64, Custom);
+ setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i32, Custom);
+
+ setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv4i64, Custom);
+ setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv8i32, Custom);
+ setPartialReduceMLAAction(MVT::nxv8i16, MVT::nxv16i16, Custom);
+ setPartialReduceMLAAction(MVT::nxv16i8, MVT::nxv32i8, Custom);
}
// Handle operations that are only available in non-streaming SVE mode.
>From 2ff7e91f9374a90f5146a5321c9ae06e15d3cb8d Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Thu, 24 Apr 2025 16:27:21 +0100
Subject: [PATCH 03/10] Re-generate tests
---
.../AArch64/sve-partial-reduce-dot-product.ll | 292 +++++++++---------
.../AArch64/sve-partial-reduce-wide-add.ll | 92 ++++--
2 files changed, 214 insertions(+), 170 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 71936b686be15..e36c56b7487ee 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -214,53 +214,51 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z30.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z31.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z8.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
+; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
+; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: udot z0.d, z5.h, z4.h
-; CHECK-NEWLOWERING-NEXT: udot z1.d, z2.h, z3.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -292,47 +290,45 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sdot z0.d, z5.h, z4.h
-; CHECK-NEWLOWERING-NEXT: sdot z1.d, z2.h, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z30.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z31.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z8.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
+; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
+; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
@@ -427,45 +423,45 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z30.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z31.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z8.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
+; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
+; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
@@ -560,45 +556,45 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z30.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z31.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z8.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
+; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
+; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
@@ -731,27 +727,27 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
;
; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_8to64:
; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z7.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z6.d
-; CHECK-NEWLOWERING-NEXT: add z4.d, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z24.d, z1.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z7.d, z1.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -773,27 +769,27 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
;
; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_8to64:
; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z7.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z6.d
-; CHECK-NEWLOWERING-NEXT: add z4.d, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z24.d, z1.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z7.d, z1.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d
; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 11fb60ead4fb2..602aa9df33b08 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -203,17 +203,41 @@ entry:
}
define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-LABEL: signed_wide_add_nxv8i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: sunpkhi z4.d, z2.s
-; CHECK-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEXT: sunpkhi z5.d, z3.s
-; CHECK-NEXT: sunpklo z3.d, z3.s
-; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: add z1.d, z1.d, z4.d
-; CHECK-NEXT: add z0.d, z3.d, z0.d
-; CHECK-NEXT: add z1.d, z5.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: sunpkhi z4.d, z2.s
+; CHECK-SVE2-NEXT: sunpklo z2.d, z2.s
+; CHECK-SVE2-NEXT: sunpkhi z5.d, z3.s
+; CHECK-SVE2-NEXT: sunpklo z3.d, z3.s
+; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d
+; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d
+; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d
+; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: signed_wide_add_nxv8i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: sunpkhi z4.d, z2.s
+; CHECK-SVE-NEXT: sunpklo z2.d, z2.s
+; CHECK-SVE-NEXT: sunpkhi z5.d, z3.s
+; CHECK-SVE-NEXT: sunpklo z3.d, z3.s
+; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
+; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d
+; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i32:
+; CHECK-NEWLOWERING: // %bb.0: // %entry
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
+; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
@@ -221,17 +245,41 @@ entry:
}
define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uunpkhi z4.d, z2.s
-; CHECK-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEXT: uunpkhi z5.d, z3.s
-; CHECK-NEXT: uunpklo z3.d, z3.s
-; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: add z1.d, z1.d, z4.d
-; CHECK-NEXT: add z0.d, z3.d, z0.d
-; CHECK-NEXT: add z1.d, z5.d, z1.d
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: uunpkhi z4.d, z2.s
+; CHECK-SVE2-NEXT: uunpklo z2.d, z2.s
+; CHECK-SVE2-NEXT: uunpkhi z5.d, z3.s
+; CHECK-SVE2-NEXT: uunpklo z3.d, z3.s
+; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d
+; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d
+; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d
+; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: uunpkhi z4.d, z2.s
+; CHECK-SVE-NEXT: uunpklo z2.d, z2.s
+; CHECK-SVE-NEXT: uunpkhi z5.d, z3.s
+; CHECK-SVE-NEXT: uunpklo z3.d, z3.s
+; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
+; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d
+; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d
+; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d
+; CHECK-SVE-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK-NEWLOWERING: // %bb.0: // %entry
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
+; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
>From 76d2744f6a5f58ef00c6d0d00e4993d1db9d6ecc Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Thu, 24 Apr 2025 16:30:20 +0100
Subject: [PATCH 04/10] Remove erroneous setPartialReduceMLAAction calls
---
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 8 --------
1 file changed, 8 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 810d42635e7b2..bbaa690e55980 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1876,14 +1876,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// Other pairs will default to 'Expand'.
setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
-
- setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i64, Custom);
- setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i32, Custom);
-
- setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv4i64, Custom);
- setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv8i32, Custom);
- setPartialReduceMLAAction(MVT::nxv8i16, MVT::nxv16i16, Custom);
- setPartialReduceMLAAction(MVT::nxv16i8, MVT::nxv32i8, Custom);
}
// Handle operations that are only available in non-streaming SVE mode.
>From b36ca2bb7d7090eab042ccc2cd229a2299b7a27b Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Thu, 24 Apr 2025 16:36:26 +0100
Subject: [PATCH 05/10] Remove dead/duplicate code
---
.../Target/AArch64/AArch64ISelLowering.cpp | 20 -------------------
1 file changed, 20 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index bbaa690e55980..447794cc2b744 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1604,26 +1604,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
- if (EnablePartialReduceNodes) {
- for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
- // 1. Set all combinations where a type is illegal to "Legal"
- // - These will be legalized to a legal type pair
- // - Avoid expanding them too early (or preventing folds)
- if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) {
- setPartialReduceMLAAction(VT, InnerVT, Legal);
- continue;
- }
- // 2. Set all legal combinations to "Expand"
- // - Not all of these can be lowered (via a Legal or Custom lowering).
- setPartialReduceMLAAction(VT, InnerVT, Expand);
- }
- }
- // 3. Mark known legal pairs as 'Legal' (these will expand to USDOT).
- setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
- setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
- }
-
// Firstly, exclude all scalable vector extending loads/truncating stores,
// include both integer and floating scalable vector.
for (MVT VT : MVT::scalable_vector_valuetypes()) {
>From 6c782ea7b2f90aa6f2fa70536dfcfe13e7297ef9 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 29 Apr 2025 14:43:06 +0100
Subject: [PATCH 06/10] Address comments and alter how PARTIAL_REDUCE_MLA
operands are split
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 24 +-
.../neon-partial-reduce-dot-product.ll | 344 +++++----
.../CodeGen/AArch64/partial-reduction-add.ll | 16 +-
.../AArch64/sve-partial-reduce-dot-product.ll | 668 +++++++-----------
.../AArch64/sve-partial-reduce-wide-add.ll | 116 +--
5 files changed, 487 insertions(+), 681 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index d0ae436a8758f..7d690ea2205d8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3223,14 +3223,6 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
SDValue Acc = N->getOperand(0);
SDValue Input1 = N->getOperand(1);
- // If the node has not gone through the DAG combine, then do not attempt to
- // legalise, just expand.
- if (!TLI.isPartialReduceMLALegal(Acc.getValueType(), Input1.getValueType())) {
- SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG);
- std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL);
- return;
- }
-
SDValue AccLo, AccHi, Input1Lo, Input1Hi, Input2Lo, Input2Hi;
std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL);
std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
@@ -4519,9 +4511,19 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
}
SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) {
- SDValue Lo, Hi;
- SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
+
+ SDLoc DL(N);
+ SDValue Acc = N->getOperand(0);
+ SDValue Input1 = N->getOperand(1);
+
+ SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi;
+ std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
+ std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL);
+ unsigned Opcode = N->getOpcode();
+ EVT ResultVT = Acc.getValueType();
+
+ auto Lo = DAG.getNode(Opcode, DL, ResultVT, Acc, Input1Lo, Input2Lo);
+ return DAG.getNode(Opcode, DL, ResultVT, Lo, Input1Hi, Input2Hi);
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 9e305056abce2..ab9813aa796e3 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -14,11 +14,10 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
-; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = zext <16 x i8> %s to <16 x i32>
@@ -50,18 +49,17 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
; CHECK-NODOT-NEXT: mov x8, xzr
; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
-; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
+; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT: ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
; CHECK-NODOT-NEXT: add x8, x8, #16
+; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
+; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT: cmp x8, #16
-; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
-; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
-; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
-; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
-; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v4.4h
+; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v4.8h
+; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h
+; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h
; CHECK-NODOT-NEXT: b.ne .LBB1_1
; CHECK-NODOT-NEXT: // %bb.2: // %end
; CHECK-NODOT-NEXT: ret
@@ -98,14 +96,14 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%u.wide = zext <8 x i8> %u to <8 x i32>
%s.wide = zext <8 x i8> %s to <8 x i32>
@@ -124,11 +122,10 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
-; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v3.8h
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%u.wide = sext <16 x i8> %u to <16 x i32>
%s.wide = sext <16 x i8> %s to <16 x i32>
@@ -148,14 +145,14 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%u.wide = sext <8 x i8> %u to <8 x i32>
%s.wide = sext <8 x i8> %s to <8 x i32>
@@ -168,14 +165,13 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM: // %bb.0:
; CHECK-NOI8MM-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: usdot:
@@ -196,20 +192,19 @@ define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
; CHECK-NOI8MM-NEXT: mov x8, xzr
; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
-; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
+; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8]
+; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8]
+; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
; CHECK-NOI8MM-NEXT: add x8, x8, #16
+; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: cmp x8, #16
-; CHECK-NOI8MM-NEXT: sshll v3.8h, v0.8b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v0.16b, #0
-; CHECK-NOI8MM-NEXT: ushll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
-; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
-; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
-; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
-; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h
; CHECK-NOI8MM-NEXT: b.ne .LBB6_1
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
; CHECK-NOI8MM-NEXT: ret
@@ -258,15 +253,15 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-NOI8MM-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NOI8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NOI8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h
-; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NOI8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v4.4h
+; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: usdot_narrow:
@@ -284,14 +279,13 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM: // %bb.0:
; CHECK-NOI8MM-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT: smull v5.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT: smlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT: add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: sudot:
@@ -312,20 +306,19 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
; CHECK-NOI8MM-NEXT: mov x8, xzr
; CHECK-NOI8MM-NEXT: .LBB9_1: // %vector.body
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
-; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
+; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8]
+; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8]
+; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
; CHECK-NOI8MM-NEXT: add x8, x8, #16
+; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: cmp x8, #16
-; CHECK-NOI8MM-NEXT: ushll v3.8h, v0.8b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v0.16b, #0
-; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
-; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
-; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
-; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
-; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v4.4h, v5.4h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v2.4h, v3.4h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v2.8h, v3.8h
; CHECK-NOI8MM-NEXT: b.ne .LBB9_1
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
; CHECK-NOI8MM-NEXT: ret
@@ -374,15 +367,15 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-NOI8MM-NEXT: ushll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT: smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT: smull2 v4.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT: ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NOI8MM-NEXT: ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NOI8MM-NEXT: smull2 v1.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT: ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NOI8MM-NEXT: smlal v3.4s, v6.4h, v5.4h
-; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NOI8MM-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v4.4h
+; CHECK-NOI8MM-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: sudot_narrow:
@@ -413,14 +406,14 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
-; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
-; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
; CHECK-NODOT-NEXT: ret
entry:
%a.wide = zext <16 x i8> %a to <16 x i64>
@@ -448,14 +441,14 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
-; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
-; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s
; CHECK-NODOT-NEXT: ret
entry:
%a.wide = sext <16 x i8> %a to <16 x i64>
@@ -470,27 +463,25 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT: sshll v7.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT: ushll v7.4s, v2.4h, #0
+; CHECK-NOI8MM-NEXT: sshll v16.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT: sshll v17.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0
+; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT: ushll2 v16.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT: sshll2 v17.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v6.4s, v7.4s
-; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s
-; CHECK-NOI8MM-NEXT: smull v18.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT: smull2 v4.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v16.4s, v17.4s
-; CHECK-NOI8MM-NEXT: smlal v0.2d, v16.2s, v17.2s
-; CHECK-NOI8MM-NEXT: smlal2 v4.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT: smlal v18.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT: add v1.2d, v4.2d, v1.2d
-; CHECK-NOI8MM-NEXT: add v0.2d, v18.2d, v0.2d
+; CHECK-NOI8MM-NEXT: sshll2 v3.4s, v3.8h, #0
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s
+; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s
+; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: usdot_8to64:
@@ -513,27 +504,25 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT: ushll v7.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT: sshll v7.4s, v2.4h, #0
+; CHECK-NOI8MM-NEXT: ushll v16.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT: ushll v17.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0
+; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT: sshll2 v16.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT: ushll2 v17.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v6.4s, v7.4s
-; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s
-; CHECK-NOI8MM-NEXT: smull v18.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT: smull2 v4.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v16.4s, v17.4s
-; CHECK-NOI8MM-NEXT: smlal v0.2d, v16.2s, v17.2s
-; CHECK-NOI8MM-NEXT: smlal2 v4.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT: smlal v18.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT: add v1.2d, v4.2d, v1.2d
-; CHECK-NOI8MM-NEXT: add v0.2d, v18.2d, v0.2d
+; CHECK-NOI8MM-NEXT: ushll2 v3.4s, v3.8h, #0
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s
+; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s
+; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: sudot_8to64:
@@ -563,11 +552,10 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h
-; CHECK-NODOT-NEXT: uaddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%a.wide = zext <16 x i8> %a to <16 x i32>
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
@@ -597,17 +585,16 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
; CHECK-NODOT-NEXT: mov x8, xzr
; CHECK-NODOT-NEXT: .LBB16_1: // %vector.body
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
+; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
; CHECK-NODOT-NEXT: add x8, x8, #16
; CHECK-NODOT-NEXT: cmp x8, #16
-; CHECK-NODOT-NEXT: ushll v2.8h, v0.8b, #0
-; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
-; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT: ushll v1.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v3.4h
+; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw v1.4s, v1.4s, v2.4h
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v3.8h
-; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NODOT-NEXT: b.ne .LBB16_1
; CHECK-NODOT-NEXT: // %bb.2: // %end
; CHECK-NODOT-NEXT: ret
@@ -641,11 +628,10 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
; CHECK-NODOT: // %bb.0:
; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v2.4h
-; CHECK-NODOT-NEXT: saddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%a.wide = sext <16 x i8> %a to <16 x i32>
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
@@ -664,14 +650,14 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%a.wide = zext <8 x i8> %a to <8 x i32>
%partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
@@ -690,14 +676,14 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT: add v0.2s, v2.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%a.wide = sext <8 x i8> %a to <8 x i32>
%partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
@@ -722,14 +708,14 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT: uaddl2 v4.2d, v3.4s, v5.4s
-; CHECK-NODOT-NEXT: uaddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
-; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
; CHECK-NODOT-NEXT: ret
%a.wide = zext <16 x i8> %a to <16 x i64>
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
@@ -754,14 +740,14 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT: saddl2 v4.2d, v3.4s, v5.4s
-; CHECK-NODOT-NEXT: saddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
-; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s
; CHECK-NODOT-NEXT: ret
%a.wide = sext <16 x i8> %a to <16 x i64>
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
@@ -808,11 +794,10 @@ define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b
; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: umull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: umlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT: umlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT: umlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-NEXT: ret
entry:
%a.wide = zext <8 x i16> %a to <8 x i64>
@@ -830,11 +815,10 @@ define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b
; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-NEXT: ret
entry:
%a.wide = sext <8 x i16> %a to <8 x i64>
@@ -852,11 +836,10 @@ define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %
; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-NEXT: ret
entry:
%a.wide = zext <8 x i16> %a to <8 x i64>
@@ -874,11 +857,10 @@ define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %
; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT: smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.2s
; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-NEXT: ret
entry:
%a.wide = sext <8 x i16> %a to <8 x i64>
@@ -897,26 +879,24 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
; CHECK-NOI8MM-NEXT: .LBB28_1: // %vector.body
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NOI8MM-NEXT: ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT: ldr q3, [x2, x8]
-; CHECK-NOI8MM-NEXT: ldr q4, [x1, x8]
+; CHECK-NOI8MM-NEXT: ldr q3, [x1, x8]
+; CHECK-NOI8MM-NEXT: ldr q4, [x2, x8]
; CHECK-NOI8MM-NEXT: add x8, x8, #16
; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT: ushll v6.8h, v4.8b, #0
+; CHECK-NOI8MM-NEXT: sshll v7.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v6.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT: ushll v3.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT: sshll v7.8h, v4.8b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v4.16b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v4.16b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: cmp x8, #1024
-; CHECK-NOI8MM-NEXT: smull v16.4s, v2.4h, v6.4h
-; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v3.4h
-; CHECK-NOI8MM-NEXT: smull v17.4s, v4.4h, v6.4h
-; CHECK-NOI8MM-NEXT: smlal v1.4s, v7.4h, v3.4h
-; CHECK-NOI8MM-NEXT: smlal2 v16.4s, v5.8h, v3.8h
-; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v6.8h
-; CHECK-NOI8MM-NEXT: smlal2 v17.4s, v7.8h, v3.8h
-; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v6.8h
-; CHECK-NOI8MM-NEXT: add v0.4s, v16.4s, v0.4s
-; CHECK-NOI8MM-NEXT: add v1.4s, v17.4s, v1.4s
+; CHECK-NOI8MM-NEXT: smlal v0.4s, v5.4h, v6.4h
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v7.4h, v6.4h
+; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-NOI8MM-NEXT: smlal v0.4s, v2.4h, v4.4h
+; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v4.4h
+; CHECK-NOI8MM-NEXT: smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v3.8h, v4.8h
; CHECK-NOI8MM-NEXT: b.ne .LBB28_1
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
; CHECK-NOI8MM-NEXT: add v0.4s, v1.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
index ae681ee54e687..c3828c3d695c4 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -18,7 +18,7 @@ define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32
; CHECK-LABEL: partial_reduce_add_fixed_half:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: ret
entry:
%partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
@@ -39,7 +39,7 @@ define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulat
; CHECK-LABEL: partial_reduce_add_half:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z1.s
-; CHECK-NEXT: add z0.s, z2.s, z0.s
+; CHECK-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
@@ -49,10 +49,10 @@ entry:
define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add_quart:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEXT: add z2.s, z2.s, z3.s
-; CHECK-NEXT: add z0.s, z4.s, z0.s
-; CHECK-NEXT: add z0.s, z2.s, z0.s
+; CHECK-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEXT: add z0.s, z0.s, z2.s
+; CHECK-NEXT: add z0.s, z0.s, z4.s
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
@@ -63,9 +63,9 @@ define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumul
; CHECK-LABEL: partial_reduce_add_half_8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: add z0.s, z0.s, z2.s
-; CHECK-NEXT: add z1.s, z1.s, z3.s
-; CHECK-NEXT: add z0.s, z4.s, z0.s
-; CHECK-NEXT: add z1.s, z5.s, z1.s
+; CHECK-NEXT: add z1.s, z1.s, z4.s
+; CHECK-NEXT: add z0.s, z0.s, z3.s
+; CHECK-NEXT: add z1.s, z1.s, z5.s
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index e36c56b7487ee..1b754fc3d320e 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -85,46 +85,42 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: uunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
+; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: uunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: uunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT: uunpklo z7.s, z1.h
+; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NOI8MM-NEXT: uunpklo z5.s, z1.h
+; CHECK-NOI8MM-NEXT: sunpklo z6.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT: movprfx z1, z3
-; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: usdot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -144,46 +140,42 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: sunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
+; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: sunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT: ptrue p0.s
; CHECK-NOI8MM-NEXT: sunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT: sunpklo z7.s, z1.h
+; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NOI8MM-NEXT: sunpklo z5.s, z1.h
+; CHECK-NOI8MM-NEXT: uunpklo z6.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT: movprfx z1, z3
-; CHECK-NOI8MM-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NOI8MM-NEXT: add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sudot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -206,18 +198,10 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: udot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
@@ -225,40 +209,32 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z30.d, z4.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z31.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z8.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -282,18 +258,10 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: sdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
@@ -301,40 +269,32 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z30.d, z4.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z31.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z8.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -358,75 +318,51 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NOI8MM-NEXT: addvl sp, sp, #-2
-; CHECK-NOI8MM-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NOI8MM-NEXT: .cfi_offset w29, -16
-; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT: sunpklo z5.h, z3.b
-; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT: sunpkhi z3.h, z3.b
+; CHECK-NOI8MM-NEXT: uunpkhi z4.h, z2.b
+; CHECK-NOI8MM-NEXT: uunpklo z2.h, z2.b
+; CHECK-NOI8MM-NEXT: sunpkhi z5.h, z3.b
+; CHECK-NOI8MM-NEXT: sunpklo z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT: uunpklo z7.s, z2.h
+; CHECK-NOI8MM-NEXT: sunpklo z24.s, z5.h
+; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT: sunpklo z7.s, z5.h
-; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT: uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h
+; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT: uunpkhi z26.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpklo z6.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpklo z27.d, z4.s
-; CHECK-NOI8MM-NEXT: sunpklo z28.d, z7.s
-; CHECK-NOI8MM-NEXT: sunpklo z29.d, z5.s
+; CHECK-NOI8MM-NEXT: uunpklo z26.d, z6.s
+; CHECK-NOI8MM-NEXT: uunpklo z27.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpklo z28.d, z24.s
+; CHECK-NOI8MM-NEXT: sunpklo z29.d, z25.s
+; CHECK-NOI8MM-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpkhi z24.d, z24.s
+; CHECK-NOI8MM-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: uunpklo z26.d, z4.s
+; CHECK-NOI8MM-NEXT: sunpklo z28.d, z5.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: uunpklo z27.d, z2.s
+; CHECK-NOI8MM-NEXT: sunpklo z29.d, z3.s
; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpkhi z2.d, z2.s
; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT: uunpkhi z30.d, z24.s
-; CHECK-NOI8MM-NEXT: uunpkhi z31.d, z2.s
-; CHECK-NOI8MM-NEXT: uunpklo z24.d, z24.s
-; CHECK-NOI8MM-NEXT: uunpklo z2.d, z2.s
-; CHECK-NOI8MM-NEXT: sunpkhi z8.d, z25.s
-; CHECK-NOI8MM-NEXT: sunpklo z25.d, z25.s
-; CHECK-NOI8MM-NEXT: sunpklo z9.d, z3.s
-; CHECK-NOI8MM-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NOI8MM-NEXT: movprfx z2, z27
-; CHECK-NOI8MM-NEXT: mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NOI8MM-NEXT: movprfx z3, z4
-; CHECK-NOI8MM-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NOI8MM-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NOI8MM-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NOI8MM-NEXT: addvl sp, sp, #2
-; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
@@ -434,40 +370,32 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z30.d, z4.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z31.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z8.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -491,75 +419,51 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NOI8MM-NEXT: addvl sp, sp, #-2
-; CHECK-NOI8MM-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NOI8MM-NEXT: .cfi_offset w29, -16
-; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NOI8MM-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT: uunpklo z5.h, z3.b
-; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT: uunpkhi z3.h, z3.b
+; CHECK-NOI8MM-NEXT: sunpkhi z4.h, z2.b
+; CHECK-NOI8MM-NEXT: sunpklo z2.h, z2.b
+; CHECK-NOI8MM-NEXT: uunpkhi z5.h, z3.b
+; CHECK-NOI8MM-NEXT: uunpklo z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT: sunpklo z7.s, z2.h
+; CHECK-NOI8MM-NEXT: uunpklo z24.s, z5.h
+; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT: uunpklo z7.s, z5.h
-; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT: sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h
+; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT: sunpkhi z26.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpklo z6.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpklo z27.d, z4.s
-; CHECK-NOI8MM-NEXT: uunpklo z28.d, z7.s
-; CHECK-NOI8MM-NEXT: uunpklo z29.d, z5.s
+; CHECK-NOI8MM-NEXT: sunpklo z26.d, z6.s
+; CHECK-NOI8MM-NEXT: sunpklo z27.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpklo z28.d, z24.s
+; CHECK-NOI8MM-NEXT: uunpklo z29.d, z25.s
+; CHECK-NOI8MM-NEXT: sunpkhi z6.d, z6.s
+; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpkhi z24.d, z24.s
+; CHECK-NOI8MM-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: sunpklo z26.d, z4.s
+; CHECK-NOI8MM-NEXT: uunpklo z28.d, z5.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: sunpklo z27.d, z2.s
+; CHECK-NOI8MM-NEXT: uunpklo z29.d, z3.s
; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpkhi z2.d, z2.s
; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT: sunpkhi z30.d, z24.s
-; CHECK-NOI8MM-NEXT: sunpkhi z31.d, z2.s
-; CHECK-NOI8MM-NEXT: sunpklo z24.d, z24.s
-; CHECK-NOI8MM-NEXT: sunpklo z2.d, z2.s
-; CHECK-NOI8MM-NEXT: uunpkhi z8.d, z25.s
-; CHECK-NOI8MM-NEXT: uunpklo z25.d, z25.s
-; CHECK-NOI8MM-NEXT: uunpklo z9.d, z3.s
-; CHECK-NOI8MM-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NOI8MM-NEXT: movprfx z2, z27
-; CHECK-NOI8MM-NEXT: mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NOI8MM-NEXT: movprfx z3, z4
-; CHECK-NOI8MM-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NOI8MM-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NOI8MM-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NOI8MM-NEXT: addvl sp, sp, #2
-; CHECK-NOI8MM-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
@@ -567,40 +471,32 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z30.d, z4.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z31.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z8.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mul z7.d, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z28.d
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z6.d, z6.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z5.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z7
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z31.d, z9.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z6
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -620,16 +516,16 @@ define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16
;
; CHECK-NEWLOWERING-LABEL: udot_no_bin_op:
; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z1.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z1.h, z1.b
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT: add z1.s, z2.s, z1.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z4.s, z0.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT: add z1.s, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
@@ -645,16 +541,16 @@ define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16
;
; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op:
; CHECK-NEWLOWERING: // %bb.0:
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z1.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z1.h, z1.b
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z1.h
; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT: add z1.s, z2.s, z1.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z4.s, z0.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT: add z1.s, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
@@ -670,16 +566,16 @@ define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale
;
; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -696,16 +592,16 @@ define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale
;
; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z4.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -729,26 +625,26 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z7.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z24.d, z1.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -771,26 +667,26 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z7.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z24.d, z1.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -870,46 +766,42 @@ define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uunpklo z3.s, z1.h
; CHECK-NEXT: sunpklo z4.s, z2.h
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEXT: sunpklo z6.d, z4.s
+; CHECK-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEXT: uunpklo z7.d, z1.s
+; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: uunpklo z5.d, z1.s
+; CHECK-NEXT: sunpklo z6.d, z2.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEXT: sunpklo z24.d, z2.s
; CHECK-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z1, z3
-; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: not_usdot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h
+; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -924,46 +816,42 @@ define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sunpklo z3.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEXT: uunpklo z6.d, z4.s
+; CHECK-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEXT: sunpklo z7.d, z1.s
+; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: sunpklo z5.d, z1.s
+; CHECK-NEXT: uunpklo z6.d, z2.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
-; CHECK-NEXT: uunpklo z24.d, z2.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z1, z3
-; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: not_sudot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
+; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -978,48 +866,44 @@ define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEXT: uunpklo z3.s, z1.h
-; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: uunpklo z5.d, z3.s
; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: uunpklo z7.d, z1.s
-; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z6.d, z4.s
; CHECK-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEXT: uunpklo z24.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: uunpklo z5.d, z1.s
+; CHECK-NEXT: uunpklo z6.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z1, z3
-; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: udot_different_types:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1039,20 +923,18 @@ define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: sunpklo z5.d, z3.s
; CHECK-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEXT: sunpklo z7.d, z1.s
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: sunpklo z6.d, z4.s
; CHECK-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEXT: sunpklo z24.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: sunpklo z5.d, z1.s
+; CHECK-NEXT: sunpklo z6.d, z2.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z1, z3
-; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sdot_different_types:
@@ -1064,20 +946,18 @@ define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1097,20 +977,18 @@ define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscal
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z5.d, z3.s
; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: uunpklo z7.d, z1.s
; CHECK-NEXT: sunpklo z4.s, z2.h
; CHECK-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: sunpklo z6.d, z4.s
; CHECK-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEXT: sunpklo z24.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: uunpklo z5.d, z1.s
+; CHECK-NEXT: sunpklo z6.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z1, z3
-; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: usdot_different_types:
@@ -1122,20 +1000,18 @@ define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscal
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1150,48 +1026,44 @@ define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscal
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEXT: sunpklo z3.s, z1.h
-; CHECK-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEXT: sunpklo z5.d, z3.s
; CHECK-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEXT: sunpklo z7.d, z1.s
-; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z6.d, z4.s
; CHECK-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEXT: uunpklo z24.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT: sunpklo z5.d, z1.s
+; CHECK-NEXT: uunpklo z6.d, z2.s
+; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT: movprfx z1, z3
-; CHECK-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sudot_different_types:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 602aa9df33b08..5148d3da6c737 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -15,7 +15,7 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
; CHECK-SVE-NEXT: sunpklo z2.d, z1.s
; CHECK-SVE-NEXT: sunpkhi z1.d, z1.s
; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT: add z0.d, z0.d, z1.d
; CHECK-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv4i32:
@@ -23,7 +23,7 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z1.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
@@ -43,7 +43,7 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
; CHECK-SVE-NEXT: uunpklo z2.d, z1.s
; CHECK-SVE-NEXT: uunpkhi z1.d, z1.s
; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT: add z0.d, z0.d, z1.d
; CHECK-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv4i32:
@@ -51,7 +51,7 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z1.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z1.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
@@ -71,7 +71,7 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
; CHECK-SVE-NEXT: sunpklo z2.s, z1.h
; CHECK-SVE-NEXT: sunpkhi z1.s, z1.h
; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s
-; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s
+; CHECK-SVE-NEXT: add z0.s, z0.s, z1.s
; CHECK-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i16:
@@ -79,7 +79,7 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z1.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
@@ -99,7 +99,7 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
; CHECK-SVE-NEXT: uunpklo z2.s, z1.h
; CHECK-SVE-NEXT: uunpkhi z1.s, z1.h
; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s
-; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s
+; CHECK-SVE-NEXT: add z0.s, z0.s, z1.s
; CHECK-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i16:
@@ -107,7 +107,7 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: add z0.s, z0.s, z1.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
@@ -127,7 +127,7 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
; CHECK-SVE-NEXT: sunpklo z2.h, z1.b
; CHECK-SVE-NEXT: sunpkhi z1.h, z1.b
; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h
-; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h
+; CHECK-SVE-NEXT: add z0.h, z0.h, z1.h
; CHECK-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv16i8:
@@ -135,7 +135,7 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z1.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
; CHECK-NEWLOWERING-NEXT: add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEWLOWERING-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
@@ -155,7 +155,7 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
; CHECK-SVE-NEXT: uunpklo z2.h, z1.b
; CHECK-SVE-NEXT: uunpkhi z1.h, z1.b
; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h
-; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h
+; CHECK-SVE-NEXT: add z0.h, z0.h, z1.h
; CHECK-SVE-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv16i8:
@@ -163,7 +163,7 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z1.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
; CHECK-NEWLOWERING-NEXT: add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-NEXT: add z0.h, z1.h, z0.h
+; CHECK-NEWLOWERING-NEXT: add z0.h, z0.h, z1.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
@@ -203,41 +203,17 @@ entry:
}
define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: signed_wide_add_nxv8i32:
-; CHECK-SVE2: // %bb.0: // %entry
-; CHECK-SVE2-NEXT: sunpkhi z4.d, z2.s
-; CHECK-SVE2-NEXT: sunpklo z2.d, z2.s
-; CHECK-SVE2-NEXT: sunpkhi z5.d, z3.s
-; CHECK-SVE2-NEXT: sunpklo z3.d, z3.s
-; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d
-; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d
-; CHECK-SVE2-NEXT: ret
-;
-; CHECK-SVE-LABEL: signed_wide_add_nxv8i32:
-; CHECK-SVE: // %bb.0: // %entry
-; CHECK-SVE-NEXT: sunpkhi z4.d, z2.s
-; CHECK-SVE-NEXT: sunpklo z2.d, z2.s
-; CHECK-SVE-NEXT: sunpkhi z5.d, z3.s
-; CHECK-SVE-NEXT: sunpklo z3.d, z3.s
-; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d
-; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d
-; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i32:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: ret
+; CHECK-LABEL: signed_wide_add_nxv8i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sunpklo z4.d, z3.s
+; CHECK-NEXT: sunpklo z5.d, z2.s
+; CHECK-NEXT: sunpkhi z3.d, z3.s
+; CHECK-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEXT: add z1.d, z1.d, z4.d
+; CHECK-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEXT: ret
entry:
%input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
@@ -245,41 +221,17 @@ entry:
}
define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
-; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-SVE2: // %bb.0: // %entry
-; CHECK-SVE2-NEXT: uunpkhi z4.d, z2.s
-; CHECK-SVE2-NEXT: uunpklo z2.d, z2.s
-; CHECK-SVE2-NEXT: uunpkhi z5.d, z3.s
-; CHECK-SVE2-NEXT: uunpklo z3.d, z3.s
-; CHECK-SVE2-NEXT: add z0.d, z0.d, z2.d
-; CHECK-SVE2-NEXT: add z1.d, z1.d, z4.d
-; CHECK-SVE2-NEXT: add z0.d, z3.d, z0.d
-; CHECK-SVE2-NEXT: add z1.d, z5.d, z1.d
-; CHECK-SVE2-NEXT: ret
-;
-; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-SVE: // %bb.0: // %entry
-; CHECK-SVE-NEXT: uunpkhi z4.d, z2.s
-; CHECK-SVE-NEXT: uunpklo z2.d, z2.s
-; CHECK-SVE-NEXT: uunpkhi z5.d, z3.s
-; CHECK-SVE-NEXT: uunpklo z3.d, z3.s
-; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT: add z1.d, z1.d, z4.d
-; CHECK-SVE-NEXT: add z0.d, z3.d, z0.d
-; CHECK-SVE-NEXT: add z1.d, z5.d, z1.d
-; CHECK-SVE-NEXT: ret
-;
-; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i32:
-; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: ret
+; CHECK-LABEL: unsigned_wide_add_nxv8i32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uunpklo z4.d, z3.s
+; CHECK-NEXT: uunpklo z5.d, z2.s
+; CHECK-NEXT: uunpkhi z3.d, z3.s
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEXT: add z1.d, z1.d, z4.d
+; CHECK-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEXT: ret
entry:
%input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(<vscale x 4 x i64> %acc, <vscale x 8 x i64> %input.wide)
>From 60b4ec6a1bbbab975d5473c70fd70cc022a7b6c2 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 30 Apr 2025 14:51:39 +0100
Subject: [PATCH 07/10] Split only the operands if the accumulator doesn't need
it.
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 24 +-
.../neon-partial-reduce-dot-product.ll | 100 ++---
.../CodeGen/AArch64/partial-reduction-add.ll | 4 +-
.../AArch64/sve-partial-reduce-dot-product.ll | 400 +++++++++---------
.../AArch64/sve-partial-reduce-wide-add.ll | 20 +-
5 files changed, 282 insertions(+), 266 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7d690ea2205d8..884a428ea9a60 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3222,12 +3222,24 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
SDLoc DL(N);
SDValue Acc = N->getOperand(0);
SDValue Input1 = N->getOperand(1);
+ SDValue Input2 = N->getOperand(2);
- SDValue AccLo, AccHi, Input1Lo, Input1Hi, Input2Lo, Input2Hi;
+ SDValue AccLo, AccHi;
std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL);
- std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
- std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL);
unsigned Opcode = N->getOpcode();
+
+ // If the input types don't need splitting, just accumulate into the
+ // low part of the accumulator.
+ if (getTypeAction(Input1.getValueType()) == TargetLowering::TypeSplitVector) {
+ Lo = DAG.getNode(Opcode, DL, AccLo.getValueType(), AccLo, Input1, Input2);
+ Hi = AccHi;
+ return;
+ }
+
+ SDValue Input1Lo, Input1Hi;
+ SDValue Input2Lo, Input2Hi;
+ std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
+ std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(Input2, DL);
EVT ResultVT = AccLo.getValueType();
Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo);
@@ -4512,9 +4524,13 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) {
- SDLoc DL(N);
SDValue Acc = N->getOperand(0);
+ assert(getTypeAction(Acc.getValueType()) != TargetLowering::TypeSplitVector &&
+ "Accumulator should already be a legal type, and shouldn't need "
+ "further splitting");
+
SDValue Input1 = N->getOperand(1);
+ SDLoc DL(N);
SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi;
std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index ab9813aa796e3..06e9bc901ab36 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -402,18 +402,18 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NODOT: // %bb.0: // %entry
; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
-; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
-; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: ushll v5.4s, v4.4h, #0
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT: ushll v3.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s
-; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s
-; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
-; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v5.2s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v5.4s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v2.4s
; CHECK-NODOT-NEXT: ret
entry:
%a.wide = zext <16 x i8> %a to <16 x i64>
@@ -437,18 +437,18 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
; CHECK-NODOT: // %bb.0: // %entry
; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
-; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
-; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: sshll v5.4s, v4.4h, #0
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
+; CHECK-NODOT-NEXT: sshll v3.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s
-; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s
-; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s
-; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v5.2s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v5.4s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s
; CHECK-NODOT-NEXT: ret
entry:
%a.wide = sext <16 x i8> %a to <16 x i64>
@@ -463,25 +463,25 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: sshll v5.8h, v3.8b, #0
+; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: ushll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT: ushll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT: sshll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT: sshll v17.4s, v3.4h, #0
+; CHECK-NOI8MM-NEXT: sshll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT: ushll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: sshll2 v5.4s, v5.8h, #0
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v7.4s
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NOI8MM-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-NOI8MM-NEXT: sshll v5.4s, v3.4h, #0
+; CHECK-NOI8MM-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s
; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v2.2s, v3.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: usdot_8to64:
@@ -504,25 +504,25 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
; CHECK-NOI8MM-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: ushll v5.8h, v3.8b, #0
+; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT: ushll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT: sshll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT: sshll v7.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT: ushll v16.4s, v5.4h, #0
-; CHECK-NOI8MM-NEXT: ushll v17.4s, v3.4h, #0
+; CHECK-NOI8MM-NEXT: ushll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT: sshll2 v4.4s, v4.8h, #0
-; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: ushll2 v5.4s, v5.8h, #0
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v7.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v7.4s
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NOI8MM-NEXT: sshll v4.4s, v2.4h, #0
+; CHECK-NOI8MM-NEXT: ushll v5.4s, v3.4h, #0
+; CHECK-NOI8MM-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT: ushll2 v3.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT: smlal v0.2d, v6.2s, v16.2s
-; CHECK-NOI8MM-NEXT: smlal v1.2d, v7.2s, v17.2s
-; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v6.4s, v16.4s
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v7.4s, v17.4s
; CHECK-NOI8MM-NEXT: smlal v0.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT: smlal v1.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT: smlal2 v1.2d, v2.4s, v3.4s
+; CHECK-NOI8MM-NEXT: smlal v0.2d, v2.2s, v3.2s
+; CHECK-NOI8MM-NEXT: smlal2 v0.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-I8MM-LABEL: sudot_8to64:
@@ -705,17 +705,17 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v5.2s
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v5.4s
; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NODOT-NEXT: uaddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: ushll v4.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: uaddw2 v0.2d, v0.2d, v2.4s
; CHECK-NODOT-NEXT: ret
%a.wide = zext <16 x i8> %a to <16 x i64>
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
@@ -737,17 +737,17 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v5.2s
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v5.4s
; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s
-; CHECK-NODOT-NEXT: saddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT: sshll v4.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: saddw2 v0.2d, v0.2d, v2.4s
; CHECK-NODOT-NEXT: ret
%a.wide = sext <16 x i8> %a to <16 x i64>
%partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
diff --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
index c3828c3d695c4..3810374b18fbe 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -62,10 +62,10 @@ entry:
define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
; CHECK-LABEL: partial_reduce_add_half_8:
; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: add z3.s, z3.s, z4.s
; CHECK-NEXT: add z0.s, z0.s, z2.s
-; CHECK-NEXT: add z1.s, z1.s, z4.s
; CHECK-NEXT: add z0.s, z0.s, z3.s
-; CHECK-NEXT: add z1.s, z1.s, z5.s
+; CHECK-NEXT: add z0.s, z0.s, z5.s
; CHECK-NEXT: ret
entry:
%partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 1b754fc3d320e..db3852a7c02b6 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -198,42 +198,42 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: udot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z3.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -258,42 +258,42 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: sdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z3.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -318,83 +318,83 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: uunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT: uunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT: sunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT: sunpklo z3.h, z3.b
+; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
+; CHECK-NOI8MM-NEXT: sunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
+; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NOI8MM-NEXT: sunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT: uunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT: sunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h
+; CHECK-NOI8MM-NEXT: sunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT: uunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT: sunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT: sunpklo z29.d, z25.s
+; CHECK-NOI8MM-NEXT: uunpklo z24.d, z6.s
+; CHECK-NOI8MM-NEXT: sunpklo z25.d, z7.s
; CHECK-NOI8MM-NEXT: uunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT: sunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT: sunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT: uunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT: sunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT: uunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT: sunpklo z29.d, z3.s
+; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NOI8MM-NEXT: uunpklo z24.d, z4.s
+; CHECK-NOI8MM-NEXT: sunpklo z25.d, z5.s
; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT: uunpkhi z2.d, z2.s
; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NOI8MM-NEXT: uunpklo z6.s, z2.h
+; CHECK-NOI8MM-NEXT: sunpklo z7.s, z3.h
+; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NOI8MM-NEXT: uunpklo z24.d, z6.s
+; CHECK-NOI8MM-NEXT: sunpklo z25.d, z7.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z6.s
+; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpklo z6.d, z2.s
+; CHECK-NOI8MM-NEXT: sunpklo z7.d, z3.s
+; CHECK-NOI8MM-NEXT: uunpkhi z2.d, z2.s
; CHECK-NOI8MM-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -419,83 +419,83 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: sunpkhi z4.h, z2.b
-; CHECK-NOI8MM-NEXT: sunpklo z2.h, z2.b
-; CHECK-NOI8MM-NEXT: uunpkhi z5.h, z3.b
-; CHECK-NOI8MM-NEXT: uunpklo z3.h, z3.b
+; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
+; CHECK-NOI8MM-NEXT: uunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
+; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NOI8MM-NEXT: uunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT: sunpklo z7.s, z2.h
-; CHECK-NOI8MM-NEXT: uunpklo z24.s, z5.h
-; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h
+; CHECK-NOI8MM-NEXT: uunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT: sunpklo z26.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpklo z27.d, z7.s
-; CHECK-NOI8MM-NEXT: uunpklo z28.d, z24.s
-; CHECK-NOI8MM-NEXT: uunpklo z29.d, z25.s
+; CHECK-NOI8MM-NEXT: sunpklo z24.d, z6.s
+; CHECK-NOI8MM-NEXT: uunpklo z25.d, z7.s
; CHECK-NOI8MM-NEXT: sunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT: uunpkhi z24.d, z24.s
-; CHECK-NOI8MM-NEXT: uunpkhi z25.d, z25.s
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT: sunpklo z26.d, z4.s
-; CHECK-NOI8MM-NEXT: uunpklo z28.d, z5.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT: sunpklo z27.d, z2.s
-; CHECK-NOI8MM-NEXT: uunpklo z29.d, z3.s
+; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NOI8MM-NEXT: sunpklo z24.d, z4.s
+; CHECK-NOI8MM-NEXT: uunpklo z25.d, z5.s
; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT: sunpkhi z2.d, z2.s
; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NOI8MM-NEXT: sunpklo z6.s, z2.h
+; CHECK-NOI8MM-NEXT: uunpklo z7.s, z3.h
+; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NOI8MM-NEXT: sunpklo z24.d, z6.s
+; CHECK-NOI8MM-NEXT: uunpklo z25.d, z7.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z6.s
+; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpklo z6.d, z2.s
+; CHECK-NOI8MM-NEXT: uunpklo z7.d, z3.s
+; CHECK-NOI8MM-NEXT: sunpkhi z2.d, z2.s
; CHECK-NOI8MM-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -625,26 +625,26 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z24.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z5.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z5.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -667,26 +667,26 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
-; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z24.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z5.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z5.d, z4.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 5148d3da6c737..1fe8628357783 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -206,13 +206,13 @@ define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vsc
; CHECK-LABEL: signed_wide_add_nxv8i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sunpklo z4.d, z3.s
-; CHECK-NEXT: sunpklo z5.d, z2.s
+; CHECK-NEXT: sunpkhi z5.d, z2.s
+; CHECK-NEXT: sunpklo z2.d, z2.s
; CHECK-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEXT: add z0.d, z0.d, z5.d
-; CHECK-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEXT: add z2.d, z5.d, z4.d
+; CHECK-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEXT: add z0.d, z0.d, z3.d
; CHECK-NEXT: ret
entry:
%input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
@@ -224,13 +224,13 @@ define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <v
; CHECK-LABEL: unsigned_wide_add_nxv8i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: uunpklo z4.d, z3.s
-; CHECK-NEXT: uunpklo z5.d, z2.s
+; CHECK-NEXT: uunpkhi z5.d, z2.s
+; CHECK-NEXT: uunpklo z2.d, z2.s
; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEXT: add z0.d, z0.d, z5.d
-; CHECK-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEXT: add z2.d, z5.d, z4.d
+; CHECK-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEXT: add z0.d, z0.d, z3.d
; CHECK-NEXT: ret
entry:
%input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>
>From 6693aa036cba885cf7bc399abe37a5f9225b7029 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 30 Apr 2025 14:51:53 +0100
Subject: [PATCH 08/10] Remove dead code
---
llvm/include/llvm/CodeGen/TargetLowering.h | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 7b0e15f951681..abe261728a3e6 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1668,12 +1668,6 @@ class TargetLoweringBase {
return Action == Legal || Action == Custom;
}
- /// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
- /// legal for this target.
- bool isPartialReduceMLALegal(EVT AccVT, EVT InputVT) const {
- return getPartialReduceMLAAction(AccVT, InputVT) == Legal;
- }
-
/// If the action for this operation is to promote, this method returns the
/// ValueType to promote to.
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
>From 2aabffb351aed75cafe17222d68fc2007cf6eba0 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 30 Apr 2025 15:09:49 +0100
Subject: [PATCH 09/10] New test precommit
---
.../AArch64/sve-partial-reduce-dot-product.ll | 456 ++++++++++--------
1 file changed, 256 insertions(+), 200 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index db3852a7c02b6..02bbb0a70dba1 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -198,42 +198,42 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: udot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -258,42 +258,42 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: sdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -318,83 +318,83 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: uunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT: sunpklo z5.h, z3.b
+; CHECK-NOI8MM-NEXT: uunpkhi z4.h, z2.b
+; CHECK-NOI8MM-NEXT: uunpklo z2.h, z2.b
+; CHECK-NOI8MM-NEXT: sunpkhi z5.h, z3.b
+; CHECK-NOI8MM-NEXT: sunpklo z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
-; CHECK-NOI8MM-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT: sunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: uunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT: sunpklo z7.s, z5.h
+; CHECK-NOI8MM-NEXT: uunpklo z7.s, z2.h
+; CHECK-NOI8MM-NEXT: sunpklo z24.s, z5.h
+; CHECK-NOI8MM-NEXT: sunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: uunpkhi z4.s, z4.h
+; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT: uunpklo z24.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpklo z25.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NOI8MM-NEXT: uunpklo z26.d, z6.s
+; CHECK-NOI8MM-NEXT: uunpklo z27.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpklo z28.d, z24.s
+; CHECK-NOI8MM-NEXT: sunpklo z29.d, z25.s
; CHECK-NOI8MM-NEXT: uunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: uunpklo z24.d, z4.s
-; CHECK-NOI8MM-NEXT: sunpklo z25.d, z5.s
+; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: sunpkhi z24.d, z24.s
+; CHECK-NOI8MM-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: uunpklo z26.d, z4.s
+; CHECK-NOI8MM-NEXT: sunpklo z28.d, z5.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: uunpklo z27.d, z2.s
+; CHECK-NOI8MM-NEXT: sunpklo z29.d, z3.s
; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
-; CHECK-NOI8MM-NEXT: uunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT: sunpklo z7.s, z3.h
-; CHECK-NOI8MM-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: uunpklo z24.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpklo z25.d, z7.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT: uunpkhi z4.d, z6.s
-; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z7.s
-; CHECK-NOI8MM-NEXT: uunpklo z6.d, z2.s
-; CHECK-NOI8MM-NEXT: sunpklo z7.d, z3.s
; CHECK-NOI8MM-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NOI8MM-NEXT: sunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z25.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -419,83 +419,83 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM: // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT: sunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT: uunpklo z5.h, z3.b
+; CHECK-NOI8MM-NEXT: sunpkhi z4.h, z2.b
+; CHECK-NOI8MM-NEXT: sunpklo z2.h, z2.b
+; CHECK-NOI8MM-NEXT: uunpkhi z5.h, z3.b
+; CHECK-NOI8MM-NEXT: uunpklo z3.h, z3.b
; CHECK-NOI8MM-NEXT: ptrue p0.d
-; CHECK-NOI8MM-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT: uunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT: sunpklo z6.s, z4.h
-; CHECK-NOI8MM-NEXT: uunpklo z7.s, z5.h
+; CHECK-NOI8MM-NEXT: sunpklo z7.s, z2.h
+; CHECK-NOI8MM-NEXT: uunpklo z24.s, z5.h
+; CHECK-NOI8MM-NEXT: uunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT: sunpkhi z4.s, z4.h
+; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT: sunpklo z24.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpklo z25.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NOI8MM-NEXT: sunpklo z26.d, z6.s
+; CHECK-NOI8MM-NEXT: sunpklo z27.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpklo z28.d, z24.s
+; CHECK-NOI8MM-NEXT: uunpklo z29.d, z25.s
; CHECK-NOI8MM-NEXT: sunpkhi z6.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: sunpklo z24.d, z4.s
-; CHECK-NOI8MM-NEXT: uunpklo z25.d, z5.s
+; CHECK-NOI8MM-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT: uunpkhi z24.d, z24.s
+; CHECK-NOI8MM-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: sunpklo z26.d, z4.s
+; CHECK-NOI8MM-NEXT: uunpklo z28.d, z5.s
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: sunpklo z27.d, z2.s
+; CHECK-NOI8MM-NEXT: uunpklo z29.d, z3.s
; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
-; CHECK-NOI8MM-NEXT: sunpklo z6.s, z2.h
-; CHECK-NOI8MM-NEXT: uunpklo z7.s, z3.h
-; CHECK-NOI8MM-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: sunpklo z24.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpklo z25.d, z7.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT: sunpkhi z4.d, z6.s
-; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z7.s
-; CHECK-NOI8MM-NEXT: sunpklo z6.d, z2.s
-; CHECK-NOI8MM-NEXT: uunpklo z7.d, z3.s
; CHECK-NOI8MM-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NOI8MM-NEXT: uunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-NOI8MM-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NOI8MM-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z25.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z7.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z7.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z4.d, z5.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -625,26 +625,26 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z24.d, z5.d
-; CHECK-NEWLOWERING-NEXT: add z5.d, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
+; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z5.d, z4.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z6.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -667,26 +667,26 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
; CHECK-NEWLOWERING: // %bb.0:
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpklo z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z24.d, z5.d
-; CHECK-NEWLOWERING-NEXT: add z5.d, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z24.d
+; CHECK-NEWLOWERING-NEXT: add z5.d, z5.d, z25.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z3.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z5.d, z4.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT: add z2.d, z3.d, z6.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z4.d
; CHECK-NEWLOWERING-NEXT: ret
%a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
%partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -1138,3 +1138,59 @@ entry:
%partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult)
ret <vscale x 2 x i16> %partial.reduce
}
+
+
+define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: partial_reduce_only_split_acc:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: and z2.h, z2.h, #0xff
+; CHECK-NEXT: and z3.h, z3.h, #0xff
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z4.s, z2.h
+; CHECK-NEXT: uunpklo z2.s, z2.h
+; CHECK-NEXT: uunpkhi z5.s, z3.h
+; CHECK-NEXT: uunpklo z3.s, z3.h
+; CHECK-NEXT: uunpklo z6.d, z4.s
+; CHECK-NEXT: uunpklo z7.d, z2.s
+; CHECK-NEXT: uunpklo z24.d, z5.s
+; CHECK-NEXT: uunpklo z25.d, z3.s
+; CHECK-NEXT: uunpkhi z4.d, z4.s
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEXT: uunpkhi z3.d, z3.s
+; CHECK-NEXT: mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEXT: mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEXT: mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEXT: mla z0.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: ret
+;
+; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc:
+; CHECK-NEWLOWERING: // %bb.0: // %entry
+; CHECK-NEWLOWERING-NEXT: and z3.h, z3.h, #0xff
+; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
+; CHECK-NEWLOWERING-NEXT: ptrue p0.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z2.d, z3.d
+; CHECK-NEWLOWERING-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
+ %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+ %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(
+ <vscale x 4 x i64> %acc, <vscale x 8 x i64> %mult)
+ ret <vscale x 4 x i64> %partial.reduce
+}
>From b45dc07b4f8103f28e32bc16ce505e6b3f784631 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 30 Apr 2025 15:11:18 +0100
Subject: [PATCH 10/10] Fix typo and update test
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 2 +-
.../AArch64/sve-partial-reduce-dot-product.ll | 18 +-----------------
2 files changed, 2 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 884a428ea9a60..e4db1663f8daa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3230,7 +3230,7 @@ void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
// If the input types don't need splitting, just accumulate into the
// low part of the accumulator.
- if (getTypeAction(Input1.getValueType()) == TargetLowering::TypeSplitVector) {
+ if (getTypeAction(Input1.getValueType()) != TargetLowering::TypeSplitVector) {
Lo = DAG.getNode(Opcode, DL, AccLo.getValueType(), AccLo, Input1, Input2);
Hi = AccHi;
return;
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 02bbb0a70dba1..039cac01008b8 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -1168,23 +1168,7 @@ define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: and z3.h, z3.h, #0xff
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z4.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z2.d, z3.d
+; CHECK-NEWLOWERING-NEXT: udot z0.d, z2.h, z3.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
More information about the llvm-commits
mailing list