[llvm] 7a06681 - Revert "[DAGCombiner] Add generic DAG combine for ISD::PARTIAL_REDUCE_MLA (#127083)"
Kazu Hirata via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 4 01:44:15 PST 2025
Author: Kazu Hirata
Date: 2025-03-04T01:44:09-08:00
New Revision: 7a06681398a33d53ba6d661777be8b4c1d19acb7
URL: https://github.com/llvm/llvm-project/commit/7a06681398a33d53ba6d661777be8b4c1d19acb7
DIFF: https://github.com/llvm/llvm-project/commit/7a06681398a33d53ba6d661777be8b4c1d19acb7.diff
LOG: Revert "[DAGCombiner] Add generic DAG combine for ISD::PARTIAL_REDUCE_MLA (#127083)"
This reverts commit 2bef21f24ba932a757a644470358c340f4bcd113.
Multiple buildbot failures have been reported:
https://github.com/llvm/llvm-project/pull/127083
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e17897cf60b0..ef6bf142b306d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -545,7 +545,6 @@ namespace {
SDValue visitMGATHER(SDNode *N);
SDValue visitMSCATTER(SDNode *N);
SDValue visitMHISTOGRAM(SDNode *N);
- SDValue visitPARTIAL_REDUCE_MLA(SDNode *N);
SDValue visitVPGATHER(SDNode *N);
SDValue visitVPSCATTER(SDNode *N);
SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1974,9 +1973,6 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::MSCATTER: return visitMSCATTER(N);
case ISD::MSTORE: return visitMSTORE(N);
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
- case ISD::PARTIAL_REDUCE_SMLA:
- case ISD::PARTIAL_REDUCE_UMLA:
- return visitPARTIAL_REDUCE_MLA(N);
case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N);
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
@@ -12496,58 +12492,6 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
return SDValue();
}
-// Makes PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(LHSExtOp), ZEXT(RHSExtOp)),
-// Splat(1)) into
-// PARTIAL_REDUCE_UMLA(Acc, LHSExtOp, RHSExtOp).
-// Makes PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(LHSExtOp), SEXT(RHSExtOp)),
-// Splat(1)) into
-// PARTIAL_REDUCE_SMLA(Acc, LHSExtOp, RHSExtOp).
-SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
- SDLoc DL(N);
-
- SDValue Acc = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- SDValue Op2 = N->getOperand(2);
-
- APInt ConstantOne;
- if (Op1->getOpcode() != ISD::MUL ||
- !ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) ||
- !ConstantOne.isOne())
- return SDValue();
-
- SDValue LHS = Op1->getOperand(0);
- SDValue RHS = Op1->getOperand(1);
- unsigned LHSOpcode = LHS->getOpcode();
- unsigned RHSOpcode = RHS->getOpcode();
- if (!ISD::isExtOpcode(LHSOpcode) || !ISD::isExtOpcode(RHSOpcode))
- return SDValue();
-
- SDValue LHSExtOp = LHS->getOperand(0);
- SDValue RHSExtOp = RHS->getOperand(0);
- EVT LHSExtOpVT = LHSExtOp.getValueType();
- if (LHSExtOpVT != RHSExtOp.getValueType() || LHSOpcode != RHSOpcode)
- return SDValue();
-
- // FIXME: Add a check to only perform the DAG combine if there is lowering
- // provided by the target
-
- bool ExtIsSigned = LHSOpcode == ISD::SIGN_EXTEND;
-
- // For a 2-stage extend the signedness of both of the extends must be the
- // same. This is so the node can be folded into only a signed or unsigned
- // node.
- bool NodeIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA;
- EVT AccElemVT = Acc.getValueType().getVectorElementType();
- if (ExtIsSigned != NodeIsSigned &&
- Op1.getValueType().getVectorElementType() != AccElemVT)
- return SDValue();
-
- unsigned NewOpcode =
- ExtIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
- return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
- RHSExtOp);
-}
-
SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
auto *SLD = cast<VPStridedLoadSDNode>(N);
EVT EltVT = SLD->getValueType(0).getVectorElementType();
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index ecb6c281b8c0b..249675470e38c 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -12,15 +12,13 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
;
; CHECK-NODOT-LABEL: udot:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
-; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
+; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
+; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
+; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = zext <16 x i8> %s to <16 x i32>
@@ -97,19 +95,17 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
;
; CHECK-NODOT-LABEL: udot_narrow:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%u.wide = zext <8 x i8> %u to <8 x i32>
%s.wide = zext <8 x i8> %s to <8 x i32>
@@ -126,15 +122,13 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
;
; CHECK-NODOT-LABEL: sdot:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
-; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
+; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
+; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
+; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%u.wide = sext <16 x i8> %u to <16 x i32>
%s.wide = sext <16 x i8> %s to <16 x i32>
@@ -151,19 +145,17 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
;
; CHECK-NODOT-LABEL: sdot_narrow:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%u.wide = sext <8 x i8> %u to <8 x i32>
%s.wide = sext <8 x i8> %s to <8 x i32>
@@ -415,27 +407,19 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
;
; CHECK-NODOT-LABEL: udot_8to64:
; CHECK-NODOT: // %bb.0: // %entry
-; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
-; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
-; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
-; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
+; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
+; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
+; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
+; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
-; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
-; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
-; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
-; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
-; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
-; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
-; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
-; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
+; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
+; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT: ret
entry:
@@ -458,27 +442,19 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
;
; CHECK-NODOT-LABEL: sdot_8to64:
; CHECK-NODOT: // %bb.0: // %entry
-; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
-; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
-; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
-; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
+; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
+; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
+; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
+; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
-; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
-; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
-; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
-; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
-; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
-; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
-; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
-; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
+; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
+; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT: ret
entry:
@@ -795,10 +771,9 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-LABEL: not_udot:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
-; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
-; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT: ret
%u.wide = zext <8 x i8> %u to <8 x i32>
%s.wide = zext <8 x i8> %s to <8 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index d7bab3297cf29..455231dd37be6 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -11,23 +11,24 @@ define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: udot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
+; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -46,23 +47,24 @@ define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
;
; CHECK-NEWLOWERING-LABEL: udot_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -81,23 +83,24 @@ define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: sdot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
+; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
+; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -116,23 +119,24 @@ define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
;
; CHECK-NEWLOWERING-LABEL: sdot_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z1.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT: mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
+; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -274,46 +278,59 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: udot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
+; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
+; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z25.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z26.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
+; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
-; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
-; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
+; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
+; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
+; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
+; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
+; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
+; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
+; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -337,46 +354,59 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
;
; CHECK-NEWLOWERING-LABEL: sdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
+; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
+; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
+; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
+; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z25.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z27.d, z26.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z26.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z7.d, z6.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z29.d, z28.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
+; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z5.d, z4.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z26.d
-; CHECK-NEWLOWERING-NEXT: movprfx z5, z27
-; CHECK-NEWLOWERING-NEXT: mla z5.d, p0/m, z28.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT: mad z2.d, p0/m, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z2.d, z1.d
+; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
+; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
+; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
+; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
+; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
+; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
+; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
+; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
+; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -845,11 +875,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEXT: mla z0.s, p0/m, z4.s, z3.s
+; CHECK-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
;
@@ -858,11 +888,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z2.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -879,11 +909,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
; CHECK-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uunpklo z3.d, z2.s
-; CHECK-NEXT: uunpklo z4.d, z1.s
-; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpklo z4.d, z2.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEXT: mla z0.d, p0/m, z4.d, z3.d
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
;
@@ -892,11 +922,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
; CHECK-NEWLOWERING-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEWLOWERING-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z2.s
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
@@ -1248,48 +1278,34 @@ define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: uunpkhi z5.d, z2.s
-; CHECK-NEXT: uunpkhi z6.d, z1.s
-; CHECK-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z2.s
+; CHECK-NEXT: uunpklo z4.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: add z0.d, z0.d, z3.d
+; CHECK-NEXT: add z2.d, z2.d, z4.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h
; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
@@ -1305,24 +1321,17 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-NEXT: sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEXT: sunpklo z4.s, z1.h
-; CHECK-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: sunpkhi z5.d, z2.s
-; CHECK-NEXT: sunpkhi z6.d, z1.s
-; CHECK-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z3.d, z2.s
+; CHECK-NEXT: uunpklo z4.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: add z0.d, z0.d, z3.d
+; CHECK-NEXT: add z2.d, z2.d, z4.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
@@ -1330,24 +1339,17 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
; CHECK-NEWLOWERING-NEXT: ptrue p0.h
; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEWLOWERING-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT: uunpklo z3.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpklo z4.d, z1.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z3.d
+; CHECK-NEWLOWERING-NEXT: add z2.d, z2.d, z4.d
; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
More information about the llvm-commits
mailing list