[llvm] 2bef21f - [DAGCombiner] Add generic DAG combine for ISD::PARTIAL_REDUCE_MLA (#127083)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 4 01:09:18 PST 2025


Author: James Chesterman
Date: 2025-03-04T09:09:15Z
New Revision: 2bef21f24ba932a757a644470358c340f4bcd113

URL: https://github.com/llvm/llvm-project/commit/2bef21f24ba932a757a644470358c340f4bcd113
DIFF: https://github.com/llvm/llvm-project/commit/2bef21f24ba932a757a644470358c340f4bcd113.diff

LOG: [DAGCombiner] Add generic DAG combine for ISD::PARTIAL_REDUCE_MLA (#127083)

Add generic DAG combine for ISD::PARTIAL_REDUCE_U/SMLA nodes. Transforms
the DAG from:
PARTIAL_REDUCE_MLA(Acc, MUL(EXT(MulOpLHS), EXT(MulOpRHS)), Splat(1)) to
PARTIAL_REDUCE_MLA(Acc, MulOpLHS, MulOpRHS).

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
    llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ef6bf142b306d..0e17897cf60b0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -545,6 +545,7 @@ namespace {
     SDValue visitMGATHER(SDNode *N);
     SDValue visitMSCATTER(SDNode *N);
     SDValue visitMHISTOGRAM(SDNode *N);
+    SDValue visitPARTIAL_REDUCE_MLA(SDNode *N);
     SDValue visitVPGATHER(SDNode *N);
     SDValue visitVPSCATTER(SDNode *N);
     SDValue visitVP_STRIDED_LOAD(SDNode *N);
@@ -1973,6 +1974,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::MSCATTER:           return visitMSCATTER(N);
   case ISD::MSTORE:             return visitMSTORE(N);
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM: return visitMHISTOGRAM(N);
+  case ISD::PARTIAL_REDUCE_SMLA:
+  case ISD::PARTIAL_REDUCE_UMLA:
+                                return visitPARTIAL_REDUCE_MLA(N);
   case ISD::VECTOR_COMPRESS:    return visitVECTOR_COMPRESS(N);
   case ISD::LIFETIME_END:       return visitLIFETIME_END(N);
   case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);
@@ -12492,6 +12496,58 @@ SDValue DAGCombiner::visitMHISTOGRAM(SDNode *N) {
   return SDValue();
 }
 
+// Makes PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(LHSExtOp), ZEXT(RHSExtOp)),
+// Splat(1)) into
+// PARTIAL_REDUCE_UMLA(Acc, LHSExtOp, RHSExtOp).
+// Makes PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(LHSExtOp), SEXT(RHSExtOp)),
+// Splat(1)) into
+// PARTIAL_REDUCE_SMLA(Acc, LHSExtOp, RHSExtOp).
+SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
+  SDLoc DL(N);
+
+  SDValue Acc = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Op2 = N->getOperand(2);
+
+  APInt ConstantOne;
+  if (Op1->getOpcode() != ISD::MUL ||
+      !ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) ||
+      !ConstantOne.isOne())
+    return SDValue();
+
+  SDValue LHS = Op1->getOperand(0);
+  SDValue RHS = Op1->getOperand(1);
+  unsigned LHSOpcode = LHS->getOpcode();
+  unsigned RHSOpcode = RHS->getOpcode();
+  if (!ISD::isExtOpcode(LHSOpcode) || !ISD::isExtOpcode(RHSOpcode))
+    return SDValue();
+
+  SDValue LHSExtOp = LHS->getOperand(0);
+  SDValue RHSExtOp = RHS->getOperand(0);
+  EVT LHSExtOpVT = LHSExtOp.getValueType();
+  if (LHSExtOpVT != RHSExtOp.getValueType() || LHSOpcode != RHSOpcode)
+    return SDValue();
+
+  // FIXME: Add a check to only perform the DAG combine if there is lowering
+  // provided by the target
+
+  bool ExtIsSigned = LHSOpcode == ISD::SIGN_EXTEND;
+
+  // For a 2-stage extend the signedness of both of the extends must be the
+  // same. This is so the node can be folded into only a signed or unsigned
+  // node.
+  bool NodeIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA;
+  EVT AccElemVT = Acc.getValueType().getVectorElementType();
+  if (ExtIsSigned != NodeIsSigned &&
+      Op1.getValueType().getVectorElementType() != AccElemVT)
+    return SDValue();
+
+  unsigned NewOpcode =
+      ExtIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
+  return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, LHSExtOp,
+                     RHSExtOp);
+}
+
 SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) {
   auto *SLD = cast<VPStridedLoadSDNode>(N);
   EVT EltVT = SLD->getValueType(0).getVectorElementType();

diff  --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 249675470e38c..ecb6c281b8c0b 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -12,13 +12,15 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ;
 ; CHECK-NODOT-LABEL: udot:
 ; CHECK-NODOT:       // %bb.0:
-; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
-; CHECK-NODOT-NEXT:    umull2 v1.8h, v2.16b, v1.16b
-; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v2.4s, v3.8h
-; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NODOT-NEXT:    ushll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    umlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    umull v5.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    umlal2 v5.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = zext <16 x i8> %s to <16 x i32>
@@ -95,17 +97,19 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ;
 ; CHECK-NODOT-LABEL: udot_narrow:
 ; CHECK-NODOT:       // %bb.0:
-; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
+; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    umull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    umull2 v4.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    umlal v0.4s, v2.4h, v1.4h
 ; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
+; CHECK-NODOT-NEXT:    umlal v3.4s, v6.4h, v5.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
@@ -122,13 +126,15 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ;
 ; CHECK-NODOT-LABEL: sdot:
 ; CHECK-NODOT:       // %bb.0:
-; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
-; CHECK-NODOT-NEXT:    smull2 v1.8h, v2.16b, v1.16b
-; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT:    saddw2 v2.4s, v2.4s, v3.8h
-; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NODOT-NEXT:    sshll v3.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    smlal v0.4s, v4.4h, v3.4h
+; CHECK-NODOT-NEXT:    smull v5.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
+; CHECK-NODOT-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = sext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
@@ -145,17 +151,19 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ;
 ; CHECK-NODOT-LABEL: sdot_narrow:
 ; CHECK-NODOT:       // %bb.0:
-; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
+; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
+; CHECK-NODOT-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
-; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    smull v3.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT:    smull2 v4.4s, v2.8h, v1.8h
+; CHECK-NODOT-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
+; CHECK-NODOT-NEXT:    smlal v0.4s, v2.4h, v1.4h
 ; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
+; CHECK-NODOT-NEXT:    smlal v3.4s, v6.4h, v5.4h
 ; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
@@ -407,19 +415,27 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
 ;
 ; CHECK-NODOT-LABEL: udot_8to64:
 ; CHECK-NODOT:       // %bb.0: // %entry
-; CHECK-NODOT-NEXT:    umull v4.8h, v2.8b, v3.8b
-; CHECK-NODOT-NEXT:    umull2 v2.8h, v2.16b, v3.16b
-; CHECK-NODOT-NEXT:    ushll v3.4s, v4.4h, #0
-; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    ushll v4.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    ushll v5.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    ushll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    ushll v7.4s, v5.4h, #0
 ; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
-; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v3.4s
-; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
-; CHECK-NODOT-NEXT:    uaddl v4.2d, v4.2s, v5.2s
-; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-NODOT-NEXT:    ushll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v16.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    ushll2 v17.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    ushll v3.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    umlal2 v1.2d, v7.4s, v6.4s
+; CHECK-NODOT-NEXT:    umlal v0.2d, v7.2s, v6.2s
+; CHECK-NODOT-NEXT:    umull2 v18.2d, v5.4s, v4.4s
+; CHECK-NODOT-NEXT:    umull v4.2d, v5.2s, v4.2s
+; CHECK-NODOT-NEXT:    umlal2 v1.2d, v17.4s, v16.4s
+; CHECK-NODOT-NEXT:    umlal v0.2d, v17.2s, v16.2s
+; CHECK-NODOT-NEXT:    umlal2 v18.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    umlal v4.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    add v1.2d, v18.2d, v1.2d
 ; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
 ; CHECK-NODOT-NEXT:    ret
 entry:
@@ -442,19 +458,27 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
 ;
 ; CHECK-NODOT-LABEL: sdot_8to64:
 ; CHECK-NODOT:       // %bb.0: // %entry
-; CHECK-NODOT-NEXT:    smull v4.8h, v2.8b, v3.8b
-; CHECK-NODOT-NEXT:    smull2 v2.8h, v2.16b, v3.16b
-; CHECK-NODOT-NEXT:    sshll v3.4s, v4.4h, #0
-; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    sshll v4.8h, v3.8b, #0
+; CHECK-NODOT-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    sshll2 v3.8h, v3.16b, #0
+; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    sshll v6.4s, v4.4h, #0
+; CHECK-NODOT-NEXT:    sshll v7.4s, v5.4h, #0
 ; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
-; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v3.4s
-; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
-; CHECK-NODOT-NEXT:    saddl v4.2d, v4.2s, v5.2s
-; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
+; CHECK-NODOT-NEXT:    sshll2 v5.4s, v5.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v16.4s, v3.8h, #0
+; CHECK-NODOT-NEXT:    sshll2 v17.4s, v2.8h, #0
+; CHECK-NODOT-NEXT:    sshll v3.4s, v3.4h, #0
+; CHECK-NODOT-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v7.4s, v6.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v7.2s, v6.2s
+; CHECK-NODOT-NEXT:    smull2 v18.2d, v5.4s, v4.4s
+; CHECK-NODOT-NEXT:    smull v4.2d, v5.2s, v4.2s
+; CHECK-NODOT-NEXT:    smlal2 v1.2d, v17.4s, v16.4s
+; CHECK-NODOT-NEXT:    smlal v0.2d, v17.2s, v16.2s
+; CHECK-NODOT-NEXT:    smlal2 v18.2d, v2.4s, v3.4s
+; CHECK-NODOT-NEXT:    smlal v4.2d, v2.2s, v3.2s
+; CHECK-NODOT-NEXT:    add v1.2d, v18.2d, v1.2d
 ; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
 ; CHECK-NODOT-NEXT:    ret
 entry:
@@ -771,9 +795,10 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 ; CHECK-LABEL: not_udot:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    umull v1.8h, v2.8b, v1.8b
-; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
+; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
+; CHECK-NEXT:    umlal2 v0.4s, v2.8h, v1.8h
 ; CHECK-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>

diff  --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 455231dd37be6..d7bab3297cf29 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -11,24 +11,23 @@ define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
 ;
 ; CHECK-NEWLOWERING-LABEL: udot:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.s, z3.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.s, p0/m, z7.s, z24.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z6.s, z5.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mul z3.s, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z6.s, z5.s
+; CHECK-NEWLOWERING-NEXT:    mad z1.s, p0/m, z2.s, z3.s
 ; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
@@ -47,24 +46,23 @@ define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_wide:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mul z3.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mad z1.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
@@ -83,24 +81,23 @@ define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a,
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.s, z3.s, z4.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.s, p0/m, z7.s, z24.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z6.s, z5.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mul z3.s, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z6.s, z5.s
+; CHECK-NEWLOWERING-NEXT:    mad z1.s, p0/m, z2.s, z3.s
 ; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
@@ -119,24 +116,23 @@ define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_wide:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mul z3.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mad z1.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
@@ -278,59 +274,46 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.h, z2.b
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z4.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z5.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z4.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z24.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z25.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z5.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z7.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z26.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z26.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    mul z27.d, z29.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z25.d, z25.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mul z4.d, z5.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z26.d
+; CHECK-NEWLOWERING-NEXT:    movprfx z5, z27
+; CHECK-NEWLOWERING-NEXT:    mla z5.d, p0/m, z28.d, z7.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z25.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mad z2.d, p0/m, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z5.d, z0.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z2.d, z1.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -354,59 +337,46 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.h, z2.b
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z4.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z5.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z4.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z24.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z25.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z5.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z7.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z26.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z26.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    mul z27.d, z29.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z25.d, z25.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mul z4.d, z5.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z26.d
+; CHECK-NEWLOWERING-NEXT:    movprfx z5, z27
+; CHECK-NEWLOWERING-NEXT:    mla z5.d, p0/m, z28.d, z7.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z25.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mad z2.d, p0/m, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z5.d, z0.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z2.d, z1.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -875,11 +845,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
 ; CHECK-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    uunpklo z3.s, z2.h
+; CHECK-NEXT:    uunpklo z4.s, z1.h
 ; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEXT:    mla z0.s, p0/m, z4.s, z3.s
 ; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEXT:    ret
 ;
@@ -888,11 +858,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
 ; CHECK-NEWLOWERING-NEXT:    and z1.h, z1.h, #0xff
 ; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z3.s, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z4.s, z3.s
 ; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z1.s, z2.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
@@ -909,11 +879,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
 ; CHECK-NEXT:    and z1.s, z1.s, #0xffff
 ; CHECK-NEXT:    and z2.s, z2.s, #0xffff
 ; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEXT:    uunpklo z4.d, z1.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    mla z0.d, p0/m, z4.d, z3.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEXT:    ret
 ;
@@ -922,11 +892,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
 ; CHECK-NEWLOWERING-NEXT:    and z1.s, z1.s, #0xffff
 ; CHECK-NEWLOWERING-NEXT:    and z2.s, z2.s, #0xffff
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z4.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z4.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
@@ -1278,34 +1248,48 @@ define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    and z1.h, z1.h, #0xff
 ; CHECK-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z3.s, z2.h
+; CHECK-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
+; CHECK-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    uunpkhi z5.d, z2.s
+; CHECK-NEXT:    uunpkhi z6.d, z1.s
+; CHECK-NEXT:    mul z3.d, z4.d, z3.d
+; CHECK-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    mad z1.d, p0/m, z2.d, z3.d
 ; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    and z1.h, z1.h, #0xff
 ; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    add z2.d, z2.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mul z3.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mad z1.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
@@ -1321,17 +1305,24 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
 ; CHECK-NEXT:    ptrue p0.h
 ; CHECK-NEXT:    sxtb z1.h, p0/m, z1.h
 ; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEXT:    add z2.d, z2.d, z4.d
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sunpklo z3.s, z2.h
+; CHECK-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    sunpkhi z5.d, z2.s
+; CHECK-NEXT:    sunpkhi z6.d, z1.s
+; CHECK-NEXT:    mul z3.d, z4.d, z3.d
+; CHECK-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEXT:    mad z1.d, p0/m, z2.d, z3.d
 ; CHECK-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEXT:    add z0.d, z2.d, z0.d
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
@@ -1339,17 +1330,24 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.h
 ; CHECK-NEWLOWERING-NEXT:    sxtb z1.h, p0/m, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT:    mul z1.h, z1.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    add z2.d, z2.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mul z3.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z1.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mad z1.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>


        


More information about the llvm-commits mailing list