[llvm] [AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA to USDOT (PR #131327)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 15 09:15:39 PDT 2025
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/131327
>From 4a6ecb90cfd3188fdfa30b8379d896c867aedbf2 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 28 Feb 2025 16:56:11 +0000
Subject: [PATCH 01/10] [AArch64][SVE] Add dot product lowering for
PARTIAL_REDUCE_MLA node
Add lowering in tablegen for the PARTIAL_REDUCE_U/SMLA ISD nodes.
This only takes effect once the DAG combine has produced the ISD
node. Also add a check so the DAG combine is only performed when
the node can eventually be lowered, which changes the NEON tests
too.
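For reference, IR of the following shape (mirroring the `udot` test in
sve-partial-reduce-dot-product.ll below) produces a PARTIAL_REDUCE_UMLA
node, which the new tablegen patterns select to a single
`udot z0.s, z1.b, z2.b`. This is a minimal sketch; the function name is
chosen for illustration:

  define <vscale x 4 x i32> @udot_example(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
  entry:
    ; Both multiplicands are zero-extended from i8 to i32 before the multiply.
    %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
    %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
    %mult = mul <vscale x 16 x i32> %a.wide, %b.wide
    ; Groups of four i32 products are accumulated into each lane of %acc.
    %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
    ret <vscale x 4 x i32> %partial.reduce
  }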
---
llvm/include/llvm/CodeGen/TargetLowering.h | 35 ++++
.../include/llvm/Target/TargetSelectionDAG.td | 9 +
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +-
.../SelectionDAG/LegalizeVectorOps.cpp | 7 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 15 ++
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 3 +
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 11 ++
.../neon-partial-reduce-dot-product.ll | 139 ++++++-------
.../AArch64/sve-partial-reduce-dot-product.ll | 186 ++++--------------
10 files changed, 176 insertions(+), 240 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2089d47e9cbc8..a9d7d596e6869 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1639,6 +1639,25 @@ class TargetLoweringBase {
getCondCodeAction(CC, VT) == Custom;
}
+ /// Return how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type
+ /// InputVT should be treated. Either it's legal, needs to be promoted to a
+ /// larger size, needs to be expanded to some other code sequence, or the
+ /// target has a custom expander for it.
+ LegalizeAction getPartialReduceMLAAction(EVT AccVT, EVT InputVT) const {
+ unsigned AccI = (unsigned)AccVT.getSimpleVT().SimpleTy;
+ unsigned InputI = (unsigned)InputVT.getSimpleVT().SimpleTy;
+ assert(AccI < MVT::VALUETYPE_SIZE && InputI < MVT::VALUETYPE_SIZE &&
+ "Table isn't big enough!");
+ return PartialReduceMLAActions[AccI][InputI];
+ }
+
+ /// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
+ /// legal or custom for this target.
+ bool isPartialReduceMLALegalOrCustom(EVT AccVT, EVT InputVT) const {
+ return getPartialReduceMLAAction(AccVT, InputVT) == Legal ||
+ getPartialReduceMLAAction(AccVT, InputVT) == Custom;
+ }
+
/// If the action for this operation is to promote, this method returns the
/// ValueType to promote to.
MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
@@ -2712,6 +2731,16 @@ class TargetLoweringBase {
setCondCodeAction(CCs, VT, Action);
}
+ /// Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input
+ /// type InputVT should be treated by the target. Either it's legal, needs to
+ /// be promoted to a larger size, needs to be expanded to some other code
+ /// sequence, or the target has a custom expander for it.
+ void setPartialReduceMLAAction(MVT AccVT, MVT InputVT,
+ LegalizeAction Action) {
+ assert(AccVT.isValid() && InputVT.isValid() && "Table isn't big enough!");
+ PartialReduceMLAActions[AccVT.SimpleTy][InputVT.SimpleTy] = Action;
+ }
+
/// If Opc/OrigVT is specified as being promoted, the promotion code defaults
/// to trying a larger integer/fp until it can find one that works. If that
/// default is insufficient, this method can be used by the target to override
@@ -3658,6 +3687,12 @@ class TargetLoweringBase {
/// up the MVT::VALUETYPE_SIZE value to the next multiple of 8.
uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::VALUETYPE_SIZE + 7) / 8];
+ /// For each result type and input type for the ISD::PARTIAL_REDUCE_U/SMLA
+ /// nodes, keep a LegalizeAction which indicates how instruction selection
+ /// should deal with this operation.
+ LegalizeAction PartialReduceMLAActions[MVT::VALUETYPE_SIZE]
+ [MVT::VALUETYPE_SIZE];
+
ValueTypeActionImpl ValueTypeActions;
private:
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 42a5fbec95174..64c27dbace397 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -313,6 +313,10 @@ def SDTSubVecInsert : SDTypeProfile<1, 3, [ // subvector insert
SDTCisSubVecOfVec<2, 1>, SDTCisSameAs<0,1>, SDTCisInt<3>
]>;
+def SDTPartialReduceMLA : SDTypeProfile<1, 3, [ // partial reduce mla
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>
+]>;
+
def SDTPrefetch : SDTypeProfile<0, 4, [ // prefetch
SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisInt<1>
]>;
@@ -513,6 +517,11 @@ def vecreduce_fmax : SDNode<"ISD::VECREDUCE_FMAX", SDTFPVecReduce>;
def vecreduce_fminimum : SDNode<"ISD::VECREDUCE_FMINIMUM", SDTFPVecReduce>;
def vecreduce_fmaximum : SDNode<"ISD::VECREDUCE_FMAXIMUM", SDTFPVecReduce>;
+def partial_reduce_umla : SDNode<"ISD::PARTIAL_REDUCE_UMLA",
+ SDTPartialReduceMLA>;
+def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA",
+ SDTPartialReduceMLA>;
+
def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>;
def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>;
def fmul : SDNode<"ISD::FMUL" , SDTFPBinOp, [SDNPCommutative]>;
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0e17897cf60b0..5aaa6cc31efd8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -12528,8 +12528,10 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) {
if (LHSExtOpVT != RHSExtOp.getValueType() || LHSOpcode != RHSOpcode)
return SDValue();
- // FIXME: Add a check to only perform the DAG combine if there is lowering
- // provided by the target
+ // Only perform the DAG combine if there is custom lowering provided by the
+ // target
+ if (!TLI.isPartialReduceMLALegalOrCustom(N->getValueType(0), LHSExtOpVT))
+ return SDValue();
bool ExtIsSigned = LHSOpcode == ISD::SIGN_EXTEND;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 27bde7b96c857..c61e5b263a967 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -469,8 +469,6 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::VECTOR_COMPRESS:
case ISD::SCMP:
case ISD::UCMP:
- case ISD::PARTIAL_REDUCE_UMLA:
- case ISD::PARTIAL_REDUCE_SMLA:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:
@@ -530,6 +528,11 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
break;
}
+ case ISD::PARTIAL_REDUCE_UMLA:
+ case ISD::PARTIAL_REDUCE_SMLA:
+ Action = TLI.getPartialReduceMLAAction(Node->getValueType(0),
+ Node->getOperand(1).getValueType());
+ break;
#define BEGIN_REGISTER_VP_SDNODE(VPID, LEGALPOS, ...) \
case ISD::VPID: { \
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index f5ea3c0b47d6a..af97ce20fdb10 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -836,9 +836,8 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SET_FPENV, VT, Expand);
setOperationAction(ISD::RESET_FPENV, VT, Expand);
- // PartialReduceMLA operations default to expand.
- setOperationAction({ISD::PARTIAL_REDUCE_UMLA, ISD::PARTIAL_REDUCE_SMLA}, VT,
- Expand);
+ for (MVT InputVT : MVT::all_valuetypes())
+ setPartialReduceMLAAction(VT, InputVT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2dca8c0da4756..d1bfd9b78fd00 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1585,6 +1585,21 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
+ for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
+ if (!EnablePartialReduceNodes)
+ break;
+ for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
+ ElementCount VTElemCount = VT.getVectorElementCount();
+ if (VTElemCount.getKnownMinValue() == 1)
+ continue;
+ if (VTElemCount * 4 == InnerVT.getVectorElementCount())
+ setPartialReduceMLAAction(VT, InnerVT, Custom);
+ if (InnerVT.getVectorElementType().getSizeInBits() * 4 ==
+ VT.getVectorElementType().getSizeInBits())
+ setPartialReduceMLAAction(VT, InnerVT, Legal);
+ }
+ }
+
// Firstly, exclude all scalable vector extending loads/truncating stores,
// include both integer and floating scalable vector.
for (MVT VT : MVT::scalable_vector_valuetypes()) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c836f3138a45f..6459ec9e4fae9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -143,6 +143,9 @@ def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->isSVEAvailable()">,
AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">;
+def HasSVEorStreamingSVE
+ : Predicate<"Subtarget->isSVEorStreamingSVEAvailable()">,
+ AssemblerPredicateWithAll<(all_of FeatureSVE), "sve">;
def HasSVEB16B16 : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEB16B16()">,
AssemblerPredicateWithAll<(all_of FeatureSVEB16B16), "sve-b16b16">;
def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2()">,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3ee71c14c6bd4..c72bc31c46878 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -655,6 +655,17 @@ let Predicates = [HasSVE_or_SME] in {
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", AArch64sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", AArch64udot>;
+ let Predicates = [HasSVEorStreamingSVE] in {
+ def : Pat<(nxv4i32 (partial_reduce_umla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+ (UDOT_ZZZ_S $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv4i32 (partial_reduce_smla nxv4i32:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+ (SDOT_ZZZ_S $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv2i64 (partial_reduce_umla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+ (UDOT_ZZZ_D $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv2i64 (partial_reduce_smla nxv2i64:$Acc, nxv8i16:$MulLHS, nxv8i16:$MulRHS)),
+ (SDOT_ZZZ_D $Acc, $MulLHS, $MulRHS)>;
+ } // End HasSVEorStreamingSVE
+
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 1c9849bdaed3c..628da72f1b9ea 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -12,15 +12,13 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
;
; CHECK-NODOT-LABEL: udot:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
-; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
+; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
+; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
+; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%u.wide = zext <16 x i8> %u to <16 x i32>
%s.wide = zext <16 x i8> %s to <16 x i32>
@@ -99,19 +97,17 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
;
; CHECK-NODOT-LABEL: udot_narrow:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%u.wide = zext <8 x i8> %u to <8 x i32>
%s.wide = zext <8 x i8> %s to <8 x i32>
@@ -128,15 +124,13 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
;
; CHECK-NODOT-LABEL: sdot:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
-; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
-; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
-; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
+; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
+; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
+; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
+; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
+; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
+; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT: ret
%u.wide = sext <16 x i8> %u to <16 x i32>
%s.wide = sext <16 x i8> %s to <16 x i32>
@@ -153,19 +147,17 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
;
; CHECK-NODOT-LABEL: sdot_narrow:
; CHECK-NODOT: // %bb.0:
-; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
-; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
-; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
-; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
-; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
-; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
+; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
+; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
+; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
-; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
-; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
+; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
+; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
+; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT: ret
%u.wide = sext <8 x i8> %u to <8 x i32>
%s.wide = sext <8 x i8> %s to <8 x i32>
@@ -417,27 +409,19 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
;
; CHECK-NODOT-LABEL: udot_8to64:
; CHECK-NODOT: // %bb.0: // %entry
-; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
-; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
-; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
-; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
+; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
+; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
+; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
+; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
-; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
-; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
-; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
-; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
-; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
-; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
-; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
-; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
-; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
-; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
+; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
+; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT: ret
entry:
@@ -460,27 +444,19 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
;
; CHECK-NODOT-LABEL: sdot_8to64:
; CHECK-NODOT: // %bb.0: // %entry
-; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
-; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
-; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
-; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
-; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
+; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
+; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
+; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
+; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
-; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
-; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
-; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
-; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
-; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
-; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
-; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
-; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
-; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
-; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
-; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
-; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
+; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
+; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
+; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
+; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
+; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
+; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT: ret
entry:
@@ -797,10 +773,9 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
; CHECK-LABEL: not_udot:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: ushll v2.8h, v2.8b, #0
-; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
-; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
+; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
+; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
+; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT: ret
%u.wide = zext <8 x i8> %u to <8 x i32>
%s.wide = zext <8 x i8> %s to <8 x i32>
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index d7bab3297cf29..5974bac348531 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -11,24 +11,7 @@ define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: udot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: udot z0.s, z1.b, z2.b
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -46,24 +29,7 @@ define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
;
; CHECK-NEWLOWERING-LABEL: udot_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -81,24 +47,7 @@ define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %accc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: sdot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z4.s, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z6.s, z5.s
-; CHECK-NEWLOWERING-NEXT: mad z1.s, p0/m, z2.s, z3.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: sdot z0.s, z1.b, z2.b
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -116,24 +65,7 @@ define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16>
;
; CHECK-NEWLOWERING-LABEL: sdot_wide:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
@@ -845,11 +777,11 @@ define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: and z2.h, z2.h, #0xff
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEXT: uunpklo z3.s, z1.h
+; CHECK-NEXT: uunpklo z4.s, z2.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEXT: mla z0.s, p0/m, z4.s, z3.s
+; CHECK-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEXT: mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT: mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT: ret
;
@@ -879,11 +811,11 @@ define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x
; CHECK-NEXT: and z1.s, z1.s, #0xffff
; CHECK-NEXT: and z2.s, z2.s, #0xffff
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uunpklo z3.d, z2.s
-; CHECK-NEXT: uunpklo z4.d, z1.s
-; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpklo z3.d, z1.s
+; CHECK-NEXT: uunpklo z4.d, z2.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
-; CHECK-NEXT: mla z0.d, p0/m, z4.d, z3.d
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT: mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT: ret
;
@@ -1248,48 +1180,24 @@ define <vscale x 2 x i16> @udot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEXT: uunpkhi z2.s, z2.h
+; CHECK-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
; CHECK-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: uunpkhi z5.d, z2.s
-; CHECK-NEXT: uunpkhi z6.d, z1.s
-; CHECK-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: uunpklo z3.d, z2.s
+; CHECK-NEXT: uunpklo z4.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: add z0.d, z0.d, z3.d
+; CHECK-NEXT: add z2.d, z2.d, z4.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: udot_nxv8i8_promote:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
; CHECK-NEWLOWERING-NEXT: and z2.h, z2.h, #0xff
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: and z1.h, z1.h, #0xff
+; CHECK-NEWLOWERING-NEXT: udot z0.d, z1.h, z2.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i16>
@@ -1305,49 +1213,25 @@ define <vscale x 2 x i16> @sdot_nxv8i8_promote (<vscale x 2 x i16> %acc, <vscale
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-NEXT: sxtb z2.h, p0/m, z2.h
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEXT: sunpklo z4.s, z1.h
-; CHECK-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: sunpkhi z5.d, z2.s
-; CHECK-NEXT: sunpkhi z6.d, z1.s
-; CHECK-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEXT: mad z1.d, p0/m, z2.d, z3.d
+; CHECK-NEXT: mul z1.h, z1.h, z2.h
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z3.d, z2.s
+; CHECK-NEXT: uunpklo z4.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: add z0.d, z0.d, z3.d
+; CHECK-NEXT: add z2.d, z2.d, z4.d
; CHECK-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEXT: add z0.d, z2.d, z0.d
; CHECK-NEXT: ret
;
; CHECK-NEWLOWERING-LABEL: sdot_nxv8i8_promote:
; CHECK-NEWLOWERING: // %bb.0: // %entry
; CHECK-NEWLOWERING-NEXT: ptrue p0.h
-; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
; CHECK-NEWLOWERING-NEXT: sxtb z2.h, p0/m, z2.h
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z6.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mul z3.d, z4.d, z3.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z1.d, z1.s
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mad z1.d, p0/m, z2.d, z3.d
-; CHECK-NEWLOWERING-NEXT: add z0.d, z1.d, z0.d
+; CHECK-NEWLOWERING-NEXT: sxtb z1.h, p0/m, z1.h
+; CHECK-NEWLOWERING-NEXT: sdot z0.d, z1.h, z2.h
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 8 x i8> %a to <vscale x 8 x i16>
>From 651b211d2cd74caf761c2bf3150d9052bbc3a11f Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 18 Mar 2025 09:35:00 +0000
Subject: [PATCH 02/10] Update test
---
.../neon-partial-reduce-dot-product.ll | 20 +++++++++----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
index 628da72f1b9ea..0645c7d46d861 100644
--- a/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll
@@ -50,20 +50,18 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
; CHECK-NODOT-NEXT: mov x8, xzr
; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NODOT-NEXT: ldr q0, [x1, x8]
-; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
+; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
+; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
; CHECK-NODOT-NEXT: add x8, x8, #16
; CHECK-NODOT-NEXT: cmp x8, #16
-; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
-; CHECK-NODOT-NEXT: ushll2 v4.8h, v2.16b, #0
-; CHECK-NODOT-NEXT: ushll v5.8h, v0.8b, #0
-; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
+; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
-; CHECK-NODOT-NEXT: umull v6.4s, v4.4h, v3.4h
-; CHECK-NODOT-NEXT: umlal v1.4s, v2.4h, v5.4h
-; CHECK-NODOT-NEXT: umlal2 v6.4s, v2.8h, v5.8h
-; CHECK-NODOT-NEXT: umlal2 v1.4s, v4.8h, v3.8h
-; CHECK-NODOT-NEXT: add v1.4s, v6.4s, v1.4s
+; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
+; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
+; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
+; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
+; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NODOT-NEXT: b.ne .LBB1_1
; CHECK-NODOT-NEXT: // %bb.2: // %end
; CHECK-NODOT-NEXT: ret
>From 866d625d483224352abf38c30915071b02648081 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 19 Mar 2025 14:33:03 +0000
Subject: [PATCH 03/10] Reduce memory footprint of PartialReduceMLAActions
---
llvm/include/llvm/CodeGen/TargetLowering.h | 30 +++++++++++++++-------
llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 ---
2 files changed, 21 insertions(+), 12 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index a9d7d596e6869..056d0b7157699 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1644,11 +1644,16 @@ class TargetLoweringBase {
/// larger size, needs to be expanded to some other code sequence, or the
/// target has a custom expander for it.
LegalizeAction getPartialReduceMLAAction(EVT AccVT, EVT InputVT) const {
- unsigned AccI = (unsigned)AccVT.getSimpleVT().SimpleTy;
- unsigned InputI = (unsigned)InputVT.getSimpleVT().SimpleTy;
- assert(AccI < MVT::VALUETYPE_SIZE && InputI < MVT::VALUETYPE_SIZE &&
- "Table isn't big enough!");
- return PartialReduceMLAActions[AccI][InputI];
+ auto AccSVT = AccVT.getSimpleVT();
+ auto InputSVT = InputVT.getSimpleVT();
+ assert(AccSVT.isValid() && InputSVT.isValid() &&
+ "getPartialReduceMLAAction types aren't valid");
+ uint16_t AccI = AccSVT.SimpleTy;
+ uint16_t InputI = InputSVT.SimpleTy;
+ uint32_t TypeHash = (AccI << 16) + InputI;
+ if (PartialReduceMLAActions.contains(TypeHash))
+ return PartialReduceMLAActions.at(TypeHash);
+ return Expand;
}
/// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
@@ -2737,8 +2742,12 @@ class TargetLoweringBase {
/// sequence, or the target has a custom expander for it.
void setPartialReduceMLAAction(MVT AccVT, MVT InputVT,
LegalizeAction Action) {
- assert(AccVT.isValid() && InputVT.isValid() && "Table isn't big enough!");
- PartialReduceMLAActions[AccVT.SimpleTy][InputVT.SimpleTy] = Action;
+ assert(AccVT.isValid() && InputVT.isValid() &&
+ "setPartialReduceMLAAction types aren't valid");
+ uint16_t AccI = AccVT.SimpleTy;
+ uint16_t InputI = InputVT.SimpleTy;
+ uint32_t TypeHash = (AccI << 16) + InputI;
+ PartialReduceMLAActions[TypeHash] = Action;
}
/// If Opc/OrigVT is specified as being promoted, the promotion code defaults
@@ -3690,8 +3699,11 @@ class TargetLoweringBase {
/// For each result type and input type for the ISD::PARTIAL_REDUCE_U/SMLA
/// nodes, keep a LegalizeAction which indicates how instruction selection
/// should deal with this operation.
- LegalizeAction PartialReduceMLAActions[MVT::VALUETYPE_SIZE]
- [MVT::VALUETYPE_SIZE];
+ /// The key is made up of the accumulator type (AccTy) and the input type
+ /// (InTy) in the format of `(AccTy << 16) + InTy`.
+ /// If no entry exists for a given key, Expand is assumed as this
+ /// is the most common action.
+ DenseMap<uint32_t, LegalizeAction> PartialReduceMLAActions;
ValueTypeActionImpl ValueTypeActions;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index af97ce20fdb10..51cde7ce139e2 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -835,9 +835,6 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::GET_FPENV, VT, Expand);
setOperationAction(ISD::SET_FPENV, VT, Expand);
setOperationAction(ISD::RESET_FPENV, VT, Expand);
-
- for (MVT InputVT : MVT::all_valuetypes())
- setPartialReduceMLAAction(VT, InputVT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
>From 18a6f976162fbf705e53e7feda165a94db8d28d5 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 25 Mar 2025 11:18:12 +0000
Subject: [PATCH 04/10] Modify how PartialReduceMLAActions are assigned
---
.../Target/AArch64/AArch64ISelLowering.cpp | 29 +++++++++++--------
1 file changed, 17 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d1bfd9b78fd00..261ecb3f2c11e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1585,19 +1585,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Custom);
}
- for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- if (!EnablePartialReduceNodes)
- break;
- for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
- ElementCount VTElemCount = VT.getVectorElementCount();
- if (VTElemCount.getKnownMinValue() == 1)
- continue;
- if (VTElemCount * 4 == InnerVT.getVectorElementCount())
- setPartialReduceMLAAction(VT, InnerVT, Custom);
- if (InnerVT.getVectorElementType().getSizeInBits() * 4 ==
- VT.getVectorElementType().getSizeInBits())
- setPartialReduceMLAAction(VT, InnerVT, Legal);
+ if (EnablePartialReduceNodes) {
+ for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
+ for (MVT InnerVT : MVT::integer_scalable_vector_valuetypes()) {
+ // 1. Set all combinations where a type is illegal to "Legal"
+ // - These will be legalized to a legal type pair
+ // - Avoid expanding them too early (or preventing folds)
+ if (!isTypeLegal(VT) || !isTypeLegal(InnerVT)) {
+ setPartialReduceMLAAction(VT, InnerVT, Legal);
+ continue;
+ }
+ // 2. Set all legal combinations to "Expand"
+ // - Not all of these can be lowered (via a Legal or Custom lowering).
+ setPartialReduceMLAAction(VT, InnerVT, Expand);
+ }
}
+ // 3. Mark known legal pairs as 'Legal' (these will expand to USDOT).
+ setPartialReduceMLAAction(MVT::nxv2i64, MVT::nxv8i16, Legal);
+ setPartialReduceMLAAction(MVT::nxv4i32, MVT::nxv16i8, Legal);
}
// Firstly, exclude all scalable vector extending loads/truncating stores,
>From 0a6d012f80fc7d6b64a78042ba8258b00c06cf12 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 26 Mar 2025 14:43:41 +0000
Subject: [PATCH 05/10] Change PartialReduceMLAActions key-type to std::pair
---
llvm/include/llvm/CodeGen/TargetLowering.h | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 056d0b7157699..e636c390566e4 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1648,9 +1648,9 @@ class TargetLoweringBase {
auto InputSVT = InputVT.getSimpleVT();
assert(AccSVT.isValid() && InputSVT.isValid() &&
"getPartialReduceMLAAction types aren't valid");
- uint16_t AccI = AccSVT.SimpleTy;
- uint16_t InputI = InputSVT.SimpleTy;
- uint32_t TypeHash = (AccI << 16) + InputI;
+ auto AccI = AccSVT.SimpleTy;
+ auto InputI = InputSVT.SimpleTy;
+ PartialReduceActionTypes TypeHash = std::make_pair(AccI, InputI);
if (PartialReduceMLAActions.contains(TypeHash))
return PartialReduceMLAActions.at(TypeHash);
return Expand;
@@ -2744,9 +2744,9 @@ class TargetLoweringBase {
LegalizeAction Action) {
assert(AccVT.isValid() && InputVT.isValid() &&
"setPartialReduceMLAAction types aren't valid");
- uint16_t AccI = AccVT.SimpleTy;
- uint16_t InputI = InputVT.SimpleTy;
- uint32_t TypeHash = (AccI << 16) + InputI;
+ auto AccI = AccVT.SimpleTy;
+ auto InputI = InputVT.SimpleTy;
+ PartialReduceActionTypes TypeHash = std::make_pair(AccI, InputI);
PartialReduceMLAActions[TypeHash] = Action;
}
@@ -3696,6 +3696,8 @@ class TargetLoweringBase {
/// up the MVT::VALUETYPE_SIZE value to the next multiple of 8.
uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::VALUETYPE_SIZE + 7) / 8];
+ using PartialReduceActionTypes =
+ std::pair<MVT::SimpleValueType, MVT::SimpleValueType>;
/// For each result type and input type for the ISD::PARTIAL_REDUCE_U/SMLA
/// nodes, keep a LegalizeAction which indicates how instruction selection
/// should deal with this operation.
@@ -3703,7 +3705,7 @@ class TargetLoweringBase {
/// (InTy) in the format of `(AccTy << 16) + InTy`.
/// If no entry exists for a given key, Expand is assumed as this
/// is the most common action.
- DenseMap<uint32_t, LegalizeAction> PartialReduceMLAActions;
+ DenseMap<PartialReduceActionTypes, LegalizeAction> PartialReduceMLAActions;
ValueTypeActionImpl ValueTypeActions;
>From df2affec0e087acb7f3a09ad3f7b19845461170d Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 26 Mar 2025 15:08:56 +0000
Subject: [PATCH 06/10] Remove outdated comment
---
llvm/include/llvm/CodeGen/TargetLowering.h | 2 --
1 file changed, 2 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e636c390566e4..4dde05f273386 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3701,8 +3701,6 @@ class TargetLoweringBase {
/// For each result type and input type for the ISD::PARTIAL_REDUCE_U/SMLA
/// nodes, keep a LegalizeAction which indicates how instruction selection
/// should deal with this operation.
- /// The key is made up of the accumulator type (AccTy) and the input type
- /// (InTy) in the format of `(AccTy << 16) + InTy`.
/// If no entry exists for a given key, Expand is assumed as this
/// is the most common action.
DenseMap<PartialReduceActionTypes, LegalizeAction> PartialReduceMLAActions;
>From 6108e7a6df5cd209e5251308b20713ad94af85c2 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Wed, 26 Mar 2025 15:58:43 +0000
Subject: [PATCH 07/10] Address nits
---
llvm/include/llvm/CodeGen/TargetLowering.h | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4dde05f273386..403e54fde77bc 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1650,10 +1650,8 @@ class TargetLoweringBase {
"getPartialReduceMLAAction types aren't valid");
auto AccI = AccSVT.SimpleTy;
auto InputI = InputSVT.SimpleTy;
- PartialReduceActionTypes TypeHash = std::make_pair(AccI, InputI);
- if (PartialReduceMLAActions.contains(TypeHash))
- return PartialReduceMLAActions.at(TypeHash);
- return Expand;
+ PartialReduceActionTypes TypePair = std::make_pair(AccI, InputI);
+ return PartialReduceMLAActions.lookup(TypePair);
}
/// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
@@ -2746,8 +2744,8 @@ class TargetLoweringBase {
"setPartialReduceMLAAction types aren't valid");
auto AccI = AccVT.SimpleTy;
auto InputI = InputVT.SimpleTy;
- PartialReduceActionTypes TypeHash = std::make_pair(AccI, InputI);
- PartialReduceMLAActions[TypeHash] = Action;
+ PartialReduceActionTypes TypePair = std::make_pair(AccI, InputI);
+ PartialReduceMLAActions[TypePair] = Action;
}
/// If Opc/OrigVT is specified as being promoted, the promotion code defaults
@@ -3701,8 +3699,6 @@ class TargetLoweringBase {
/// For each result type and input type for the ISD::PARTIAL_REDUCE_U/SMLA
/// nodes, keep a LegalizeAction which indicates how instruction selection
/// should deal with this operation.
- /// If no entry exists for a given key, Expand is assumed as this
- /// is the most common action.
DenseMap<PartialReduceActionTypes, LegalizeAction> PartialReduceMLAActions;
ValueTypeActionImpl ValueTypeActions;
>From 160ee3452ea8eb2b6acf6b4f4806aada6fa3459e Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Thu, 27 Mar 2025 12:41:44 +0000
Subject: [PATCH 08/10] Re-implement explicit default assignment
---
llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 51cde7ce139e2..af97ce20fdb10 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -835,6 +835,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::GET_FPENV, VT, Expand);
setOperationAction(ISD::SET_FPENV, VT, Expand);
setOperationAction(ISD::RESET_FPENV, VT, Expand);
+
+ for (MVT InputVT : MVT::all_valuetypes())
+ setPartialReduceMLAAction(VT, InputVT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
>From 364835daebfd3a49fb6579679a1aea1654a71344 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Fri, 28 Mar 2025 14:06:25 +0000
Subject: [PATCH 09/10] Re-implement explicit default assignment without
reverting the memory footprint savings
---
llvm/include/llvm/CodeGen/TargetLowering.h | 5 ++++-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 ---
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 403e54fde77bc..0d1ae37eaef0a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1651,7 +1651,10 @@ class TargetLoweringBase {
auto AccI = AccSVT.SimpleTy;
auto InputI = InputSVT.SimpleTy;
PartialReduceActionTypes TypePair = std::make_pair(AccI, InputI);
- return PartialReduceMLAActions.lookup(TypePair);
+ auto It = PartialReduceMLAActions.find(TypePair);
+ if (It != PartialReduceMLAActions.end())
+ return It->second;
+ return Expand;
}
/// Return true if a PARTIAL_REDUCE_U/SMLA node with the specified types is
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index af97ce20fdb10..51cde7ce139e2 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -835,9 +835,6 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::GET_FPENV, VT, Expand);
setOperationAction(ISD::SET_FPENV, VT, Expand);
setOperationAction(ISD::RESET_FPENV, VT, Expand);
-
- for (MVT InputVT : MVT::all_valuetypes())
- setPartialReduceMLAAction(VT, InputVT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.
>From c967e33bb7966107dea8937e0ab48802fb80170c Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 28 Feb 2025 17:31:08 +0000
Subject: [PATCH 10/10] [AArch64][SVE] Add lowering for PARTIAL_REDUCE_U/SMLA
to USDOT
Add lowering of PARTIAL_REDUCE_U/SMLA nodes to the USDOT
instruction. This applies when the second operand of the ISD node
is a MUL whose two operands are extended with different
signedness.
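For reference, IR of the following shape (mirroring the `usdot` test
updated below) now lowers to a single `usdot z0.s, z1.b, z2.b` instead of
a long unpack/mla sequence. This is a minimal sketch; the function name
is chosen for illustration:

  define <vscale x 4 x i32> @usdot_example(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
  entry:
    ; The two multiplicands are extended with different signedness; this
    ; is the pattern matched here. USDOT takes the unsigned operand first
    ; and the signed operand last, so %a maps to z1 and %b to z2.
    %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
    %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
    %mult = mul <vscale x 16 x i32> %a.wide, %b.wide
    %partial.reduce = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
    ret <vscale x 4 x i32> %partial.reduce
  }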
---
.../CodeGen/SelectionDAG/LegalizeTypes.cpp | 15 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 81 +++++++++
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 +
.../AArch64/sve-partial-reduce-dot-product.ll | 158 ++----------------
4 files changed, 109 insertions(+), 146 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b6abad830c371..cfff61727ecb5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -920,8 +920,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
/// illegal ResNo in that case.
bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
// See if the target wants to custom lower this node.
- if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
- return false;
+ unsigned Opcode = N->getOpcode();
+ bool IsPRMLAOpcode =
+ Opcode == ISD::PARTIAL_REDUCE_UMLA || Opcode == ISD::PARTIAL_REDUCE_SMLA;
+
+ if (IsPRMLAOpcode) {
+ if (TLI.getPartialReduceMLAAction(N->getValueType(0),
+ N->getOperand(1).getValueType()) !=
+ TargetLowering::Custom)
+ return false;
+ } else {
+ if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
+ return false;
+ }
SmallVector<SDValue, 8> Results;
if (LegalizeResult)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 261ecb3f2c11e..6b020d71204a0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7736,6 +7736,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFLDEXP(Op, DAG);
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return LowerVECTOR_HISTOGRAM(Op, DAG);
+ case ISD::PARTIAL_REDUCE_UMLA:
+ case ISD::PARTIAL_REDUCE_SMLA:
+ return LowerPARTIAL_REDUCE_MLA(Op, DAG);
}
}
@@ -27474,6 +27477,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
Results.push_back(Res);
return;
+ case ISD::PARTIAL_REDUCE_UMLA:
+ case ISD::PARTIAL_REDUCE_SMLA:
+ Results.push_back(LowerPARTIAL_REDUCE_MLA(SDValue(N, 0), DAG));
+ return;
case ISD::ADD:
case ISD::FADD:
ReplaceAddWithADDP(N, Results, DAG, Subtarget);
@@ -29396,6 +29403,80 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
return Scatter;
}
+// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(ZEXT(MulOpLHS), SEXT(MulOpRHS)), Splat 1)
+// to USDOT(Acc, MulOpLHS, MulOpRHS)
+// Lower PARTIAL_REDUCE_*MLA(Acc, MUL(SEXT(MulOpLHS), ZEXT(MulOpRHS)), Splat 1)
+// to USDOT(Acc, MulOpRHS, MulOpLHS)
+SDValue
+AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
+ SelectionDAG &DAG) const {
+ bool Scalable = Op.getValueType().isScalableVector();
+ auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
+ if (Scalable && !Subtarget.isSVEorStreamingSVEAvailable())
+ return SDValue();
+ if (!Scalable && (!Subtarget.isNeonAvailable() || !Subtarget.hasDotProd()))
+ return SDValue();
+ if (!Subtarget.hasMatMulInt8())
+ return SDValue();
+ SDLoc DL(Op);
+
+ if (Op.getOperand(1).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ SDValue Acc = Op.getOperand(0);
+ SDValue Mul = Op.getOperand(1);
+
+ APInt ConstantOne;
+ if (!ISD::isConstantSplatVector(Op.getOperand(2).getNode(), ConstantOne) ||
+ !ConstantOne.isOne())
+ return SDValue();
+
+ SDValue ExtMulOpLHS = Mul.getOperand(0);
+ SDValue ExtMulOpRHS = Mul.getOperand(1);
+ unsigned ExtMulOpLHSOpcode = ExtMulOpLHS.getOpcode();
+ unsigned ExtMulOpRHSOpcode = ExtMulOpRHS.getOpcode();
+ if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
+ !ISD::isExtOpcode(ExtMulOpRHSOpcode))
+ return SDValue();
+
+ SDValue MulOpLHS = ExtMulOpLHS.getOperand(0);
+ SDValue MulOpRHS = ExtMulOpRHS.getOperand(0);
+ EVT MulOpLHSVT = MulOpLHS.getValueType();
+ if (MulOpLHSVT != MulOpRHS.getValueType())
+ return SDValue();
+
+ bool LHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
+ bool RHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
+ if (LHSIsSigned == RHSIsSigned)
+ return SDValue();
+
+ EVT AccVT = Acc.getValueType();
+ // There is no nxv2i64 version of usdot
+ if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
+ return SDValue();
+
+ // USDOT expects the signed operand to be last
+ if (!RHSIsSigned)
+ std::swap(MulOpLHS, MulOpRHS);
+
+ unsigned Opcode = AArch64ISD::USDOT;
+ // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
+ // product followed by a zero / sign extension
+ // Don't want this to be split because there is no nxv2i64 version of usdot
+ if ((AccVT == MVT::nxv4i64 && MulOpLHSVT == MVT::nxv16i8) ||
+ (AccVT == MVT::v4i64 && MulOpLHSVT == MVT::v16i8)) {
+ EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
+
+ SDValue DotI32 =
+ DAG.getNode(Opcode, DL, AccVTI32, DAG.getConstant(0, DL, AccVTI32),
+ MulOpLHS, MulOpRHS);
+ SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
+ return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
+ }
+
+ return DAG.getNode(Opcode, DL, AccVT, Acc, MulOpLHS, MulOpRHS);
+}
+
SDValue
AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 1987c892ac080..01f10e668da2f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1197,6 +1197,7 @@ class AArch64TargetLowering : public TargetLowering {
SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 5974bac348531..d08524c105a05 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -106,25 +106,7 @@ define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: usdot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: uunpklo z3.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: usdot z0.s, z1.b, z2.b
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -165,25 +147,7 @@ define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a,
;
; CHECK-NEWLOWERING-LABEL: sudot:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: sunpklo z3.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.h, z1.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z1.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z1.s, z1.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z5.s, z6.s
-; CHECK-NEWLOWERING-NEXT: mul z3.s, z3.s, z4.s
-; CHECK-NEWLOWERING-NEXT: mla z0.s, p0/m, z1.s, z2.s
-; CHECK-NEWLOWERING-NEXT: movprfx z1, z3
-; CHECK-NEWLOWERING-NEXT: mla z1.s, p0/m, z7.s, z24.s
-; CHECK-NEWLOWERING-NEXT: add z0.s, z1.s, z0.s
+; CHECK-NEWLOWERING-NEXT: usdot z0.s, z2.b, z1.b
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
@@ -389,59 +353,12 @@ define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NEWLOWERING-LABEL: usdot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: uunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.h, z3.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
-; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEWLOWERING-NEXT: usdot z4.s, z2.b, z3.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
@@ -522,59 +439,12 @@ define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i
;
; CHECK-NEWLOWERING-LABEL: sudot_8to64:
; CHECK-NEWLOWERING: // %bb.0: // %entry
-; CHECK-NEWLOWERING-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #-2
-; CHECK-NEWLOWERING-NEXT: str z9, [sp] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEWLOWERING-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
-; CHECK-NEWLOWERING-NEXT: sunpklo z4.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpklo z5.h, z3.b
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.h, z2.b
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.h, z3.b
-; CHECK-NEWLOWERING-NEXT: ptrue p0.d
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.s, z4.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.s, z4.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z7.s, z5.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.s, z5.h
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.s, z2.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z2.s, z2.h
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.s, z3.h
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.s, z3.h
-; CHECK-NEWLOWERING-NEXT: sunpkhi z26.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z6.d, z6.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z27.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z28.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z29.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z4.d, z4.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z7.d, z7.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z5.d, z5.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z30.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpkhi z31.d, z2.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z24.d, z24.s
-; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z2.s
-; CHECK-NEWLOWERING-NEXT: uunpkhi z8.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z25.d, z25.s
-; CHECK-NEWLOWERING-NEXT: uunpklo z9.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z27.d, z27.d, z29.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z6.d, z28.d
-; CHECK-NEWLOWERING-NEXT: uunpkhi z3.d, z3.s
-; CHECK-NEWLOWERING-NEXT: mul z4.d, z4.d, z5.d
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z26.d, z7.d
-; CHECK-NEWLOWERING-NEXT: mla z0.d, p0/m, z2.d, z9.d
-; CHECK-NEWLOWERING-NEXT: movprfx z2, z27
-; CHECK-NEWLOWERING-NEXT: mla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEWLOWERING-NEXT: ldr z9, [sp] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: mla z1.d, p0/m, z31.d, z3.d
-; CHECK-NEWLOWERING-NEXT: movprfx z3, z4
-; CHECK-NEWLOWERING-NEXT: mla z3.d, p0/m, z30.d, z8.d
-; CHECK-NEWLOWERING-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT: add z0.d, z2.d, z0.d
-; CHECK-NEWLOWERING-NEXT: add z1.d, z3.d, z1.d
-; CHECK-NEWLOWERING-NEXT: addvl sp, sp, #2
-; CHECK-NEWLOWERING-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEWLOWERING-NEXT: usdot z4.s, z3.b, z2.b
+; CHECK-NEWLOWERING-NEXT: sunpklo z2.d, z4.s
+; CHECK-NEWLOWERING-NEXT: sunpkhi z3.d, z4.s
+; CHECK-NEWLOWERING-NEXT: add z0.d, z0.d, z2.d
+; CHECK-NEWLOWERING-NEXT: add z1.d, z1.d, z3.d
; CHECK-NEWLOWERING-NEXT: ret
entry:
%a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>