[llvm] b6f65f0 - [SelectionDAG] Improve type legalisation for PARTIAL_REDUCE_MLA (#130935)

via llvm-commits llvm-commits at lists.llvm.org
Thu May 1 07:08:50 PDT 2025


Author: Nicholas Guy
Date: 2025-05-01T15:08:46+01:00
New Revision: b6f65f07bc0214f932aa69fc58859ad5ac9efa1a

URL: https://github.com/llvm/llvm-project/commit/b6f65f07bc0214f932aa69fc58859ad5ac9efa1a
DIFF: https://github.com/llvm/llvm-project/commit/b6f65f07bc0214f932aa69fc58859ad5ac9efa1a.diff

LOG: [SelectionDAG] Improve type legalisation for PARTIAL_REDUCE_MLA (#130935)

Implement proper splitting functions for PARTIAL_REDUCE_MLA ISD nodes.
This makes the udot_8to64 and sdot_8to64 tests generate dot product
instructions for when the new ISD nodes are used.

---------

Co-authored-by: James Chesterman <james.chesterman at arm.com>

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
    llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
    llvm/test/CodeGen/AArch64/partial-reduction-add.ll
    llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
    llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index a01e1cff74564..d0b69b88748a9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3220,8 +3220,30 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::SplitVecRes_PARTIAL_REDUCE_MLA(SDNode *N, SDValue &Lo,
                                                       SDValue &Hi) {
   SDLoc DL(N);
-  SDValue Expanded = TLI.expandPartialReduceMLA(N, DAG);
-  std::tie(Lo, Hi) = DAG.SplitVector(Expanded, DL);
+  SDValue Acc = N->getOperand(0);
+  SDValue Input1 = N->getOperand(1);
+  SDValue Input2 = N->getOperand(2);
+
+  SDValue AccLo, AccHi;
+  std::tie(AccLo, AccHi) = DAG.SplitVector(Acc, DL);
+  unsigned Opcode = N->getOpcode();
+
+  // If the input types don't need splitting, just accumulate into the
+  // low part of the accumulator.
+  if (getTypeAction(Input1.getValueType()) != TargetLowering::TypeSplitVector) {
+    Lo = DAG.getNode(Opcode, DL, AccLo.getValueType(), AccLo, Input1, Input2);
+    Hi = AccHi;
+    return;
+  }
+
+  SDValue Input1Lo, Input1Hi;
+  SDValue Input2Lo, Input2Hi;
+  std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(Input1, DL);
+  std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(Input2, DL);
+  EVT ResultVT = AccLo.getValueType();
+
+  Lo = DAG.getNode(Opcode, DL, ResultVT, AccLo, Input1Lo, Input2Lo);
+  Hi = DAG.getNode(Opcode, DL, ResultVT, AccHi, Input1Hi, Input2Hi);
 }
 
 void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) {
@@ -4501,7 +4523,20 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECTOR_HISTOGRAM(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N) {
-  return TLI.expandPartialReduceMLA(N, DAG);
+  SDValue Acc = N->getOperand(0);
+  assert(getTypeAction(Acc.getValueType()) != TargetLowering::TypeSplitVector &&
+         "Accumulator should already be a legal type, and shouldn't need "
+         "further splitting");
+
+  SDLoc DL(N);
+  SDValue Input1Lo, Input1Hi, Input2Lo, Input2Hi;
+  std::tie(Input1Lo, Input1Hi) = DAG.SplitVector(N->getOperand(1), DL);
+  std::tie(Input2Lo, Input2Hi) = DAG.SplitVector(N->getOperand(2), DL);
+  unsigned Opcode = N->getOpcode();
+  EVT ResultVT = Acc.getValueType();
+
+  SDValue Lo = DAG.getNode(Opcode, DL, ResultVT, Acc, Input1Lo, Input2Lo);
+  return DAG.getNode(Opcode, DL, ResultVT, Lo, Input1Hi, Input2Hi);
 }
 
 //===----------------------------------------------------------------------===//

diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 9e305056abce2..ab9813aa796e3 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -14,11 +14,10 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
 ; CHECK-NODOT-NEXT:    umull2 v1.8h, v2.16b, v1.16b
-; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v3.8h
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = zext <16 x i8> %u to <16 x i32>
   %s.wide = zext <16 x i8> %s to <16 x i32>
@@ -50,18 +49,17 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NODOT-NEXT:    mov x8, xzr
 ; CHECK-NODOT-NEXT:  .LBB1_1: // %vector.body
 ; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT:    ldr q0, [x0, x8]
-; CHECK-NODOT-NEXT:    ldr q2, [x1, x8]
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q3, [x1, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NODOT-NEXT:    add x8, x8, #16
+; CHECK-NODOT-NEXT:    umull v4.8h, v2.8b, v3.8b
+; CHECK-NODOT-NEXT:    umull2 v2.8h, v2.16b, v3.16b
 ; CHECK-NODOT-NEXT:    cmp x8, #16
-; CHECK-NODOT-NEXT:    umull v3.8h, v0.8b, v2.8b
-; CHECK-NODOT-NEXT:    umull2 v2.8h, v0.16b, v2.16b
-; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT:    ushll v1.4s, v2.4h, #0
-; CHECK-NODOT-NEXT:    uaddw v4.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v4.4s, v2.8h
-; CHECK-NODOT-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v4.4h
+; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v4.8h
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v2.4h
+; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
 ; CHECK-NODOT-NEXT:    b.ne .LBB1_1
 ; CHECK-NODOT-NEXT:  // %bb.2: // %end
 ; CHECK-NODOT-NEXT:    ret
@@ -98,14 +96,14 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
 ; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT:    ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = zext <8 x i8> %u to <8 x i32>
   %s.wide = zext <8 x i8> %s to <8 x i32>
@@ -124,11 +122,10 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
 ; CHECK-NODOT-NEXT:    smull2 v1.8h, v2.16b, v1.16b
-; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v3.4h
-; CHECK-NODOT-NEXT:    saddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v3.8h
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = sext <16 x i8> %u to <16 x i32>
   %s.wide = sext <16 x i8> %s to <16 x i32>
@@ -148,14 +145,14 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
 ; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
 ; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT:    ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
   %u.wide = sext <8 x i8> %u to <8 x i32>
   %s.wide = sext <8 x i8> %s to <8 x i32>
@@ -168,14 +165,13 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
 ; CHECK-NOI8MM-LABEL: usdot:
 ; CHECK-NOI8MM:       // %bb.0:
 ; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
 ; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
 ; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
 ; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
 ; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-I8MM-LABEL: usdot:
@@ -196,20 +192,19 @@ define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NOI8MM-NEXT:    mov x8, xzr
 ; CHECK-NOI8MM-NEXT:  .LBB6_1: // %vector.body
 ; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q0, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q2, [x1, x8]
+; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NOI8MM-NEXT:    add x8, x8, #16
+; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
+; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
 ; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    sshll v3.8h, v0.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v4.8h, v0.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v3.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smull v6.4s, v4.4h, v2.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v2.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v6.4s, v3.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    add v1.4s, v6.4s, v1.4s
+; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
 ; CHECK-NOI8MM-NEXT:    b.ne .LBB6_1
 ; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
 ; CHECK-NOI8MM-NEXT:    ret
@@ -258,15 +253,15 @@ define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 ; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
 ; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
 ; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
 ; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-I8MM-LABEL: usdot_narrow:
@@ -284,14 +279,13 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
 ; CHECK-NOI8MM-LABEL: sudot:
 ; CHECK-NOI8MM:       // %bb.0:
 ; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
 ; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
 ; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
 ; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v4.8h, v3.8h
+; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
 ; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-I8MM-LABEL: sudot:
@@ -312,20 +306,19 @@ define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
 ; CHECK-NOI8MM-NEXT:    mov x8, xzr
 ; CHECK-NOI8MM-NEXT:  .LBB9_1: // %vector.body
 ; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NOI8MM-NEXT:    ldr q0, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q2, [x1, x8]
+; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
+; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NOI8MM-NEXT:    add x8, x8, #16
+; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
+; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
 ; CHECK-NOI8MM-NEXT:    cmp x8, #16
-; CHECK-NOI8MM-NEXT:    ushll v3.8h, v0.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v4.8h, v0.16b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    mov v0.16b, v1.16b
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v3.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    smull v6.4s, v4.4h, v2.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v2.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v6.4s, v3.8h, v5.8h
-; CHECK-NOI8MM-NEXT:    add v1.4s, v6.4s, v1.4s
+; CHECK-NOI8MM-NEXT:    smlal v1.4s, v4.4h, v5.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v5.8h
+; CHECK-NOI8MM-NEXT:    smlal v1.4s, v2.4h, v3.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v2.8h, v3.8h
 ; CHECK-NOI8MM-NEXT:    b.ne .LBB9_1
 ; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
 ; CHECK-NOI8MM-NEXT:    ret
@@ -374,15 +367,15 @@ define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
 ; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
-; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
-; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
 ; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
+; CHECK-NOI8MM-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NOI8MM-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-NOI8MM-NEXT:    smull2 v1.4s, v2.8h, v1.8h
 ; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
-; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NOI8MM-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
+; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v4.4h
+; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-I8MM-LABEL: sudot_narrow:
@@ -413,14 +406,14 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
 ; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
 ; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT:    uaddw v1.2d, v1.2d, v5.2s
 ; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
-; CHECK-NODOT-NEXT:    uaddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT:    uaddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
 ; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
+; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 entry:
   %a.wide = zext <16 x i8> %a to <16 x i64>
@@ -448,14 +441,14 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
 ; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
 ; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
 ; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT:    saddw v1.2d, v1.2d, v5.2s
 ; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v3.2s
-; CHECK-NODOT-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
-; CHECK-NODOT-NEXT:    saddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v3.4s
+; CHECK-NODOT-NEXT:    saddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
 ; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
-; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
+; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
 ; CHECK-NODOT-NEXT:    ret
 entry:
   %a.wide = sext <16 x i8> %a to <16 x i64>
@@ -470,27 +463,25 @@ define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
 ; CHECK-NOI8MM-LABEL: usdot_8to64:
 ; CHECK-NOI8MM:       // %bb.0: // %entry
 ; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
 ; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
 ; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
 ; CHECK-NOI8MM-NEXT:    ushll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT:    ushll v7.4s, v2.4h, #0
+; CHECK-NOI8MM-NEXT:    sshll v16.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT:    sshll v17.4s, v3.4h, #0
 ; CHECK-NOI8MM-NEXT:    ushll2 v4.4s, v4.8h, #0
+; CHECK-NOI8MM-NEXT:    ushll2 v2.4s, v2.8h, #0
 ; CHECK-NOI8MM-NEXT:    sshll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v16.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v17.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll v2.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    sshll v3.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v6.4s, v7.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v7.2s
-; CHECK-NOI8MM-NEXT:    smull v18.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smull2 v4.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v16.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v16.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v4.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    smlal v18.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-NOI8MM-NEXT:    add v0.2d, v18.2d, v0.2d
+; CHECK-NOI8MM-NEXT:    sshll2 v3.4s, v3.8h, #0
+; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-I8MM-LABEL: usdot_8to64:
@@ -513,27 +504,25 @@ define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
 ; CHECK-NOI8MM-LABEL: sudot_8to64:
 ; CHECK-NOI8MM:       // %bb.0: // %entry
 ; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
-; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
 ; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
+; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
 ; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
 ; CHECK-NOI8MM-NEXT:    sshll v6.4s, v4.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v7.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT:    sshll v7.4s, v2.4h, #0
+; CHECK-NOI8MM-NEXT:    ushll v16.4s, v5.4h, #0
+; CHECK-NOI8MM-NEXT:    ushll v17.4s, v3.4h, #0
 ; CHECK-NOI8MM-NEXT:    sshll2 v4.4s, v4.8h, #0
+; CHECK-NOI8MM-NEXT:    sshll2 v2.4s, v2.8h, #0
 ; CHECK-NOI8MM-NEXT:    ushll2 v5.4s, v5.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v16.4s, v2.8h, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v17.4s, v3.8h, #0
-; CHECK-NOI8MM-NEXT:    sshll v2.4s, v2.4h, #0
-; CHECK-NOI8MM-NEXT:    ushll v3.4s, v3.4h, #0
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v6.4s, v7.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v7.2s
-; CHECK-NOI8MM-NEXT:    smull v18.2d, v4.2s, v5.2s
-; CHECK-NOI8MM-NEXT:    smull2 v4.2d, v4.4s, v5.4s
-; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v16.4s, v17.4s
-; CHECK-NOI8MM-NEXT:    smlal v0.2d, v16.2s, v17.2s
-; CHECK-NOI8MM-NEXT:    smlal2 v4.2d, v2.4s, v3.4s
-; CHECK-NOI8MM-NEXT:    smlal v18.2d, v2.2s, v3.2s
-; CHECK-NOI8MM-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-NOI8MM-NEXT:    add v0.2d, v18.2d, v0.2d
+; CHECK-NOI8MM-NEXT:    ushll2 v3.4s, v3.8h, #0
+; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v16.2s
+; CHECK-NOI8MM-NEXT:    smlal v1.2d, v7.2s, v17.2s
+; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v6.4s, v16.4s
+; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v7.4s, v17.4s
+; CHECK-NOI8MM-NEXT:    smlal v0.2d, v4.2s, v5.2s
+; CHECK-NOI8MM-NEXT:    smlal v1.2d, v2.2s, v3.2s
+; CHECK-NOI8MM-NEXT:    smlal2 v0.2d, v4.4s, v5.4s
+; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v2.4s, v3.4s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-I8MM-LABEL: sudot_8to64:
@@ -563,11 +552,10 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
 ; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT:    ushll v3.4s, v1.4h, #0
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v2.4h
-; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NODOT-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
@@ -597,17 +585,16 @@ define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
 ; CHECK-NODOT-NEXT:    mov x8, xzr
 ; CHECK-NODOT-NEXT:  .LBB16_1: // %vector.body
 ; CHECK-NODOT-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT:    ldr q0, [x0, x8]
+; CHECK-NODOT-NEXT:    ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
 ; CHECK-NODOT-NEXT:    add x8, x8, #16
 ; CHECK-NODOT-NEXT:    cmp x8, #16
-; CHECK-NODOT-NEXT:    ushll v2.8h, v0.8b, #0
-; CHECK-NODOT-NEXT:    ushll2 v3.8h, v0.16b, #0
-; CHECK-NODOT-NEXT:    mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT:    ushll v1.4s, v3.4h, #0
-; CHECK-NODOT-NEXT:    uaddw v4.4s, v0.4s, v2.4h
+; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
+; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v3.4h
+; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v3.8h
+; CHECK-NODOT-NEXT:    uaddw v1.4s, v1.4s, v2.4h
 ; CHECK-NODOT-NEXT:    uaddw2 v1.4s, v1.4s, v2.8h
-; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v4.4s, v3.8h
-; CHECK-NODOT-NEXT:    add v1.4s, v1.4s, v2.4s
 ; CHECK-NODOT-NEXT:    b.ne .LBB16_1
 ; CHECK-NODOT-NEXT:  // %bb.2: // %end
 ; CHECK-NODOT-NEXT:    ret
@@ -641,11 +628,10 @@ define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
 ; CHECK-NODOT:       // %bb.0:
 ; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
 ; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT:    sshll v3.4s, v1.4h, #0
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v2.4h
-; CHECK-NODOT-NEXT:    saddw2 v2.4s, v3.4s, v2.8h
+; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v2.8h
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
 ; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
-; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NODOT-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i32>
   %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
@@ -664,14 +650,14 @@ define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT:    ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
   %a.wide = zext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
@@ -690,14 +676,14 @@ define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a){
 ; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
 ; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
-; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
-; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
-; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
 ; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
-; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
-; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
+; CHECK-NODOT-NEXT:    ext v2.16b, v3.16b, v3.16b, #8
+; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
+; CHECK-NODOT-NEXT:    add v0.2s, v2.2s, v0.2s
 ; CHECK-NODOT-NEXT:    ret
   %a.wide = sext <8 x i8> %a to <8 x i32>
   %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
@@ -722,14 +708,14 @@ define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
 ; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
 ; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT:    uaddw v1.2d, v1.2d, v5.2s
 ; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT:    uaddl2 v4.2d, v3.4s, v5.4s
-; CHECK-NODOT-NEXT:    uaddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT:    uaddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v3.2s
 ; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT:    uaddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
   %a.wide = zext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
@@ -754,14 +740,14 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
 ; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
 ; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
 ; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
+; CHECK-NODOT-NEXT:    saddw v1.2d, v1.2d, v5.2s
 ; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
-; CHECK-NODOT-NEXT:    saddl2 v4.2d, v3.4s, v5.4s
-; CHECK-NODOT-NEXT:    saddl v3.2d, v3.2s, v5.2s
+; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v5.4s
+; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v4.4s
+; CHECK-NODOT-NEXT:    saddw v1.2d, v1.2d, v2.2s
+; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v3.2s
 ; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
-; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
-; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
-; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
+; CHECK-NODOT-NEXT:    saddw2 v0.2d, v0.2d, v3.4s
 ; CHECK-NODOT-NEXT:    ret
   %a.wide = sext <16 x i8> %a to <16 x i64>
   %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
@@ -808,11 +794,10 @@ define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b
 ; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
 ; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    umull v5.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    umlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    umlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
@@ -830,11 +815,10 @@ define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b
 ; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
 ; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
@@ -852,11 +836,10 @@ define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %
 ; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
 ; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = zext <8 x i16> %a to <8 x i64>
@@ -874,11 +857,10 @@ define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %
 ; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
 ; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
-; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
+; CHECK-NEXT:    smlal2 v0.2d, v3.4s, v4.4s
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.2s
 ; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
-; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
-; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
 ; CHECK-NEXT:    ret
 entry:
   %a.wide = sext <8 x i16> %a to <8 x i64>
@@ -897,26 +879,24 @@ define <4 x i32> @usdot_multiple_zext_users(ptr %p1, ptr %p2, ptr %p3) {
 ; CHECK-NOI8MM-NEXT:  .LBB28_1: // %vector.body
 ; CHECK-NOI8MM-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NOI8MM-NEXT:    ldr q2, [x0, x8]
-; CHECK-NOI8MM-NEXT:    ldr q3, [x2, x8]
-; CHECK-NOI8MM-NEXT:    ldr q4, [x1, x8]
+; CHECK-NOI8MM-NEXT:    ldr q3, [x1, x8]
+; CHECK-NOI8MM-NEXT:    ldr q4, [x2, x8]
 ; CHECK-NOI8MM-NEXT:    add x8, x8, #16
 ; CHECK-NOI8MM-NEXT:    sshll v5.8h, v2.8b, #0
+; CHECK-NOI8MM-NEXT:    ushll v6.8h, v4.8b, #0
+; CHECK-NOI8MM-NEXT:    sshll v7.8h, v3.8b, #0
 ; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll2 v6.8h, v3.16b, #0
-; CHECK-NOI8MM-NEXT:    ushll v3.8h, v3.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll v7.8h, v4.8b, #0
-; CHECK-NOI8MM-NEXT:    sshll2 v4.8h, v4.16b, #0
+; CHECK-NOI8MM-NEXT:    ushll2 v4.8h, v4.16b, #0
+; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
 ; CHECK-NOI8MM-NEXT:    cmp x8, #1024
-; CHECK-NOI8MM-NEXT:    smull v16.4s, v2.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smull v17.4s, v4.4h, v6.4h
-; CHECK-NOI8MM-NEXT:    smlal v1.4s, v7.4h, v3.4h
-; CHECK-NOI8MM-NEXT:    smlal2 v16.4s, v5.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v17.4s, v7.8h, v3.8h
-; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v4.8h, v6.8h
-; CHECK-NOI8MM-NEXT:    add v0.4s, v16.4s, v0.4s
-; CHECK-NOI8MM-NEXT:    add v1.4s, v17.4s, v1.4s
+; CHECK-NOI8MM-NEXT:    smlal v0.4s, v5.4h, v6.4h
+; CHECK-NOI8MM-NEXT:    smlal v1.4s, v7.4h, v6.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v5.8h, v6.8h
+; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v7.8h, v6.8h
+; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v4.4h
+; CHECK-NOI8MM-NEXT:    smlal v1.4s, v3.4h, v4.4h
+; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v4.8h
+; CHECK-NOI8MM-NEXT:    smlal2 v1.4s, v3.8h, v4.8h
 ; CHECK-NOI8MM-NEXT:    b.ne .LBB28_1
 ; CHECK-NOI8MM-NEXT:  // %bb.2: // %end
 ; CHECK-NOI8MM-NEXT:    add v0.4s, v1.4s, v0.4s

diff  --git a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
index ae681ee54e687..c3828c3d695c4 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduction-add.ll
@@ -18,7 +18,7 @@ define <4 x i32> @partial_reduce_add_fixed_half(<4 x i32> %accumulator, <8 x i32
 ; CHECK-LABEL: partial_reduce_add_fixed_half:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %accumulator, <8 x i32> %0)
@@ -39,7 +39,7 @@ define <vscale x 4 x i32> @partial_reduce_add_half(<vscale x 4 x i32> %accumulat
 ; CHECK-LABEL: partial_reduce_add_half:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add z0.s, z0.s, z1.s
-; CHECK-NEXT:    add z0.s, z2.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %accumulator, <vscale x 8 x i32> %0)
@@ -49,10 +49,10 @@ entry:
 define <vscale x 4 x i32> @partial_reduce_add_quart(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0) #0 {
 ; CHECK-LABEL: partial_reduce_add_quart:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEXT:    add z2.s, z2.s, z3.s
-; CHECK-NEXT:    add z0.s, z4.s, z0.s
-; CHECK-NEXT:    add z0.s, z2.s, z0.s
+; CHECK-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEXT:    add z0.s, z0.s, z2.s
+; CHECK-NEXT:    add z0.s, z0.s, z4.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %accumulator, <vscale x 16 x i32> %0)
@@ -63,9 +63,9 @@ define <vscale x 8 x i32> @partial_reduce_add_half_8(<vscale x 8 x i32> %accumul
 ; CHECK-LABEL: partial_reduce_add_half_8:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEXT:    add z1.s, z1.s, z3.s
-; CHECK-NEXT:    add z0.s, z4.s, z0.s
-; CHECK-NEXT:    add z1.s, z5.s, z1.s
+; CHECK-NEXT:    add z1.s, z1.s, z4.s
+; CHECK-NEXT:    add z0.s, z0.s, z3.s
+; CHECK-NEXT:    add z1.s, z1.s, z5.s
 ; CHECK-NEXT:    ret
 entry:
   %partial.reduce = call <vscale x 8 x i32> @llvm.experimental.vector.partial.reduce.add.nxv8i32.nxv8i32.nxv16i32(<vscale x 8 x i32> %accumulator, <vscale x 16 x i32> %0)

diff  --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index ed27f40aba774..039cac01008b8 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -85,46 +85,42 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
 ; CHECK-NOI8MM:       // %bb.0: // %entry
 ; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z1.b
 ; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
+; CHECK-NOI8MM-NEXT:    ptrue p0.s
 ; CHECK-NOI8MM-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
 ; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
 ; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
 ; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z1.h
+; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z2.h
 ; CHECK-NOI8MM-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z2.h
 ; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
 ; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mul z3.s, z3.s, z4.s
 ; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    movprfx z1, z3
-; CHECK-NOI8MM-NEXT:    mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NOI8MM-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: usdot:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z3.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z3.s, z4.s
 ; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.s, z3.s, z4.s
 ; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -144,46 +140,42 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
 ; CHECK-NOI8MM:       // %bb.0: // %entry
 ; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z1.b
 ; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
+; CHECK-NOI8MM-NEXT:    ptrue p0.s
 ; CHECK-NOI8MM-NEXT:    sunpkhi z1.h, z1.b
 ; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    ptrue p0.s
 ; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z1.h
+; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z2.h
 ; CHECK-NOI8MM-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z2.h
 ; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z3.s, z4.s
 ; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NOI8MM-NEXT:    mul z3.s, z3.s, z4.s
 ; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NOI8MM-NEXT:    movprfx z1, z3
-; CHECK-NOI8MM-NEXT:    mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NOI8MM-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: sudot:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z3.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z5.s, z6.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z3.s, z4.s
 ; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.s, z3.s, z4.s
 ; CHECK-NEWLOWERING-NEXT:    mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -206,59 +198,43 @@ define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z25.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z5.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.d, z6.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z28.d
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -282,59 +258,43 @@ define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z25.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z5.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.d, z6.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z28.d
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -358,116 +318,84 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
 ;
 ; CHECK-NOI8MM-LABEL: usdot_8to64:
 ; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NOI8MM-NEXT:    addvl sp, sp, #-2
-; CHECK-NOI8MM-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NOI8MM-NEXT:    .cfi_offset w29, -16
-; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpklo z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z3.h, z3.b
+; CHECK-NOI8MM-NEXT:    uunpkhi z4.h, z2.b
+; CHECK-NOI8MM-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NOI8MM-NEXT:    sunpkhi z5.h, z3.b
+; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NOI8MM-NEXT:    ptrue p0.d
 ; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z2.h
+; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z5.h
+; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
 ; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z2.h
 ; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
+; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
 ; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z7.s
-; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z5.s
+; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z6.s
+; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z7.s
+; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z24.s
+; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z25.s
+; CHECK-NOI8MM-NEXT:    uunpkhi z6.d, z6.s
+; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-NOI8MM-NEXT:    sunpkhi z25.d, z25.s
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT:    uunpklo z26.d, z4.s
+; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z5.s
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z2.s
+; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z3.s
 ; CHECK-NOI8MM-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT:    uunpkhi z2.d, z2.s
 ; CHECK-NOI8MM-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z30.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z31.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpklo z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z8.d, z25.s
-; CHECK-NOI8MM-NEXT:    sunpklo z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    sunpklo z9.d, z3.s
-; CHECK-NOI8MM-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z6.d, z28.d
 ; CHECK-NOI8MM-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NOI8MM-NEXT:    movprfx z2, z27
-; CHECK-NOI8MM-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NOI8MM-NEXT:    movprfx z3, z4
-; CHECK-NOI8MM-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NOI8MM-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NOI8MM-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NOI8MM-NEXT:    addvl sp, sp, #2
-; CHECK-NOI8MM-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: usdot_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.h, z3.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z25.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z5.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z29.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z28.d
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -491,116 +419,84 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
 ;
 ; CHECK-NOI8MM-LABEL: sudot_8to64:
 ; CHECK-NOI8MM:       // %bb.0: // %entry
-; CHECK-NOI8MM-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NOI8MM-NEXT:    addvl sp, sp, #-2
-; CHECK-NOI8MM-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NOI8MM-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NOI8MM-NEXT:    .cfi_offset w29, -16
-; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpklo z5.h, z3.b
-; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NOI8MM-NEXT:    uunpkhi z3.h, z3.b
+; CHECK-NOI8MM-NEXT:    sunpkhi z4.h, z2.b
+; CHECK-NOI8MM-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NOI8MM-NEXT:    uunpkhi z5.h, z3.b
+; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NOI8MM-NEXT:    ptrue p0.d
 ; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z2.h
+; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z5.h
+; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
 ; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z5.h
-; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z2.h
 ; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
+; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
 ; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NOI8MM-NEXT:    sunpkhi z26.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z7.s
-; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z5.s
+; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z6.s
+; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z7.s
+; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z24.s
+; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z25.s
+; CHECK-NOI8MM-NEXT:    sunpkhi z6.d, z6.s
+; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-NOI8MM-NEXT:    uunpkhi z25.d, z25.s
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT:    sunpklo z26.d, z4.s
+; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z5.s
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z2.s
+; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z3.s
 ; CHECK-NOI8MM-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-NOI8MM-NEXT:    sunpkhi z2.d, z2.s
 ; CHECK-NOI8MM-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z30.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpkhi z31.d, z2.s
-; CHECK-NOI8MM-NEXT:    sunpklo z24.d, z24.s
-; CHECK-NOI8MM-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NOI8MM-NEXT:    uunpkhi z8.d, z25.s
-; CHECK-NOI8MM-NEXT:    uunpklo z25.d, z25.s
-; CHECK-NOI8MM-NEXT:    uunpklo z9.d, z3.s
-; CHECK-NOI8MM-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z6.d, z28.d
 ; CHECK-NOI8MM-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NOI8MM-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NOI8MM-NEXT:    movprfx z2, z27
-; CHECK-NOI8MM-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NOI8MM-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NOI8MM-NEXT:    movprfx z3, z4
-; CHECK-NOI8MM-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NOI8MM-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NOI8MM-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NOI8MM-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NOI8MM-NEXT:    addvl sp, sp, #2
-; CHECK-NOI8MM-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z3.d
 ; CHECK-NOI8MM-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: sudot_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT:    str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.h, z3.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.h, z3.b
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.s, z4.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z7.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z24.s, z5.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z25.s, z3.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z5.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z26.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.d, z6.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z24.d, z24.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z25.d, z25.s
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z26.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z28.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z27.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z29.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z7.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z6.d, z28.d
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT:    mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT:    mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z26.d, z28.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z27.d, z29.d
+; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z2.d, z3.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -620,16 +516,16 @@ define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_no_bin_op:
 ; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z1.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z1.s, z2.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z4.s, z0.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT:    add z1.s, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEWLOWERING-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
@@ -645,16 +541,16 @@ define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op:
 ; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z1.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z1.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z3.s
-; CHECK-NEWLOWERING-NEXT:    add z1.s, z2.s, z1.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z4.s, z0.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT:    add z1.s, z4.s, z3.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
 ; CHECK-NEWLOWERING-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
   %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
@@ -670,16 +566,16 @@ define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_wide:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z3.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z2.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z4.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -696,16 +592,16 @@ define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_wide:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z1.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z3.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z3.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z2.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z4.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z4.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z2.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -727,28 +623,28 @@ define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
 ;
 ; CHECK-NEWLOWERING-LABEL: udot_no_bin_op_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    uunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z25.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    add z4.d, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z2.d, z3.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z7.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z4.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z2.d, z1.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    add z5.d, z5.d, z25.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    add z3.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    ret
   %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -769,28 +665,28 @@ define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale
 ;
 ; CHECK-NEWLOWERING-LABEL: sdot_no_bin_op_8to64:
 ; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    sunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z25.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z2.b
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z3.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z7.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z3.d, z3.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z25.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z5.d, z5.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z4.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    add z4.d, z25.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z2.d, z3.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z5.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z7.d, z1.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z4.d, z0.d
-; CHECK-NEWLOWERING-NEXT:    add z1.d, z2.d, z1.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z24.d
+; CHECK-NEWLOWERING-NEXT:    add z5.d, z5.d, z25.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    add z3.d, z7.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z5.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT:    add z1.d, z1.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    ret
   %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
   %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
@@ -870,46 +766,42 @@ define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z2.s
 ; CHECK-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEXT:    sunpklo z24.d, z2.s
 ; CHECK-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mul z3.d, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: not_usdot:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -924,46 +816,42 @@ define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEXT:    sunpkhi z3.d, z3.s
 ; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    sunpklo z5.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z2.s
 ; CHECK-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEXT:    uunpklo z24.d, z2.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEXT:    mul z3.d, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: not_sudot:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
+; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z2.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -978,48 +866,44 @@ define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z6.d, z4.s
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z2.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
 ;
; CHECK-NEWLOWERING-LABEL: udot_different_types:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1039,20 +923,18 @@ define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
 ; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z7.d, z1.s
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEXT:    sunpklo z6.d, z4.s
 ; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    sunpklo z5.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z2.s
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
 ;
; CHECK-NEWLOWERING-LABEL: sdot_different_types:
@@ -1064,20 +946,18 @@ define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1097,20 +977,18 @@ define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscal
 ; CHECK-NEXT:    ptrue p0.d
 ; CHECK-NEXT:    uunpklo z5.d, z3.s
 ; CHECK-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEXT:    uunpklo z7.d, z1.s
 ; CHECK-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    sunpklo z6.d, z4.s
 ; CHECK-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEXT:    sunpklo z6.d, z2.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
 ;
; CHECK-NEWLOWERING-LABEL: usdot_different_types:
@@ -1122,20 +1000,18 @@ define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscal
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z7.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z4.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    uunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    sunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1150,48 +1026,44 @@ define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscal
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEXT:    sunpklo z5.d, z3.s
 ; CHECK-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpklo z6.d, z4.s
 ; CHECK-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEXT:    sunpklo z5.d, z1.s
+; CHECK-NEXT:    uunpklo z6.d, z2.s
+; CHECK-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEXT:    movprfx z1, z3
-; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: sudot_different_types:
 ; CHECK-NEWLOWERING:       // %bb.0: // %entry
 ; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z3.s, z1.h
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    ptrue p0.d
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z4.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.s, z2.h
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z3.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT:    sunpklo z7.d, z1.s
-; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z4.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT:    uunpklo z24.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
+; CHECK-NEWLOWERING-NEXT:    sunpklo z5.d, z1.s
+; CHECK-NEWLOWERING-NEXT:    uunpklo z6.d, z2.s
+; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT:    mul z3.d, z3.d, z4.d
+; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z3.d, z4.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z5.d, z6.d
 ; CHECK-NEWLOWERING-NEXT:    mla z0.d, p0/m, z1.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT:    mla z1.d, p0/m, z7.d, z24.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
   %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -1266,3 +1138,43 @@ entry:
   %partial.reduce = tail call <vscale x 2 x i16> @llvm.experimental.vector.partial.reduce.add.nxv2i16.nxv8i16(<vscale x 2 x i16> %acc, <vscale x 8 x i16> %mult)
   ret <vscale x 2 x i16> %partial.reduce
 }
+
+
+define <vscale x 4 x i64> @partial_reduce_only_split_acc(<vscale x 4 x i64> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
+; CHECK-LABEL: partial_reduce_only_split_acc:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpkhi z4.s, z2.h
+; CHECK-NEXT:    uunpklo z2.s, z2.h
+; CHECK-NEXT:    uunpkhi z5.s, z3.h
+; CHECK-NEXT:    uunpklo z3.s, z3.h
+; CHECK-NEXT:    uunpklo z6.d, z4.s
+; CHECK-NEXT:    uunpklo z7.d, z2.s
+; CHECK-NEXT:    uunpklo z24.d, z5.s
+; CHECK-NEXT:    uunpklo z25.d, z3.s
+; CHECK-NEXT:    uunpkhi z4.d, z4.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    uunpkhi z5.d, z5.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    mla z1.d, p0/m, z6.d, z24.d
+; CHECK-NEXT:    mla z0.d, p0/m, z7.d, z25.d
+; CHECK-NEXT:    mla z1.d, p0/m, z4.d, z5.d
+; CHECK-NEXT:    mla z0.d, p0/m, z2.d, z3.d
+; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: partial_reduce_only_split_acc:
+; CHECK-NEWLOWERING:       // %bb.0: // %entry
+; CHECK-NEWLOWERING-NEXT:    and z3.h, z3.h, #0xff
+; CHECK-NEWLOWERING-NEXT:    and z2.h, z2.h, #0xff
+; CHECK-NEWLOWERING-NEXT:    udot z0.d, z2.h, z3.h
+; CHECK-NEWLOWERING-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i64>
+  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
+  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv8i64(
+  <vscale x 4 x i64> %acc, <vscale x 8 x i64> %mult)
+  ret <vscale x 4 x i64> %partial.reduce
+}

diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 11fb60ead4fb2..5148d3da6c737 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -15,7 +15,7 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
 ; CHECK-SVE-NEXT:    sunpklo z2.d, z1.s
 ; CHECK-SVE-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv4i32:
@@ -23,7 +23,7 @@ define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vsc
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z2.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
@@ -43,7 +43,7 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
 ; CHECK-SVE-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-SVE-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-SVE-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-SVE-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv4i32:
@@ -51,7 +51,7 @@ define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <v
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z2.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.d, z1.s
 ; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z2.d
-; CHECK-NEWLOWERING-NEXT:    add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT:    add z0.d, z0.d, z1.d
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
@@ -71,7 +71,7 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
 ; CHECK-SVE-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-SVE-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-SVE-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv8i16:
@@ -79,7 +79,7 @@ define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vsc
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z2.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
@@ -99,7 +99,7 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
 ; CHECK-SVE-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-SVE-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-SVE-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-SVE-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-SVE-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv8i16:
@@ -107,7 +107,7 @@ define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <v
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z2.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.s, z1.h
 ; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z2.s
-; CHECK-NEWLOWERING-NEXT:    add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT:    add z0.s, z0.s, z1.s
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
@@ -127,7 +127,7 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
 ; CHECK-SVE-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-SVE-NEXT:    sunpkhi z1.h, z1.b
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-SVE-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: signed_wide_add_nxv16i8:
@@ -135,7 +135,7 @@ define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vsc
 ; CHECK-NEWLOWERING-NEXT:    sunpklo z2.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    sunpkhi z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-NEWLOWERING-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
@@ -155,7 +155,7 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
 ; CHECK-SVE-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-SVE-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-SVE-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-SVE-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-SVE-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-SVE-NEXT:    ret
 ;
 ; CHECK-NEWLOWERING-LABEL: unsigned_wide_add_nxv16i8:
@@ -163,7 +163,7 @@ define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <v
 ; CHECK-NEWLOWERING-NEXT:    uunpklo z2.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    uunpkhi z1.h, z1.b
 ; CHECK-NEWLOWERING-NEXT:    add z0.h, z0.h, z2.h
-; CHECK-NEWLOWERING-NEXT:    add z0.h, z1.h, z0.h
+; CHECK-NEWLOWERING-NEXT:    add z0.h, z0.h, z1.h
 ; CHECK-NEWLOWERING-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
@@ -205,14 +205,14 @@ entry:
 define <vscale x 4 x i64> @signed_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
 ; CHECK-LABEL: signed_wide_add_nxv8i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    sunpkhi z4.d, z2.s
-; CHECK-NEXT:    sunpklo z2.d, z2.s
-; CHECK-NEXT:    sunpkhi z5.d, z3.s
-; CHECK-NEXT:    sunpklo z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    sunpklo z4.d, z3.s
+; CHECK-NEXT:    sunpklo z5.d, z2.s
+; CHECK-NEXT:    sunpkhi z3.d, z3.s
+; CHECK-NEXT:    sunpkhi z2.d, z2.s
+; CHECK-NEXT:    add z0.d, z0.d, z5.d
 ; CHECK-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEXT:    add z0.d, z3.d, z0.d
-; CHECK-NEXT:    add z1.d, z5.d, z1.d
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
     %input.wide = sext <vscale x 8 x i32> %input to <vscale x 8 x i64>
@@ -223,14 +223,14 @@ entry:
 define <vscale x 4 x i64> @unsigned_wide_add_nxv8i32(<vscale x 4 x i64> %acc, <vscale x 8 x i32> %input){
 ; CHECK-LABEL: unsigned_wide_add_nxv8i32:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    uunpkhi z4.d, z2.s
-; CHECK-NEXT:    uunpklo z2.d, z2.s
-; CHECK-NEXT:    uunpkhi z5.d, z3.s
-; CHECK-NEXT:    uunpklo z3.d, z3.s
-; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    uunpklo z4.d, z3.s
+; CHECK-NEXT:    uunpklo z5.d, z2.s
+; CHECK-NEXT:    uunpkhi z3.d, z3.s
+; CHECK-NEXT:    uunpkhi z2.d, z2.s
+; CHECK-NEXT:    add z0.d, z0.d, z5.d
 ; CHECK-NEXT:    add z1.d, z1.d, z4.d
-; CHECK-NEXT:    add z0.d, z3.d, z0.d
-; CHECK-NEXT:    add z1.d, z5.d, z1.d
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
 ; CHECK-NEXT:    ret
 entry:
     %input.wide = zext <vscale x 8 x i32> %input to <vscale x 8 x i64>


        


More information about the llvm-commits mailing list