[llvm] [AArch64][SVE] Add partial reduction SDNodes (PR #117185)

Mon Jan 20 07:14:44 PST 2025

https://github.com/JamesChesterman updated https://github.com/llvm/llvm-project/pull/117185

>From a476b641dd4cd13bdd7420fe2da1129ba84c96cc Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Thu, 21 Nov 2024 16:19:19 +0000
Subject: [PATCH 01/13] [AArch64][SVE] Add partial reduction SDNodes

Add the opcode 'ISD::PARTIAL_REDUCE_ADD' and use it when creating
SDNodes. When the input and output types allow lowering to wide add
or dot product instruction(s), convert the corresponding intrinsic
to an SDNode with this opcode. This will make legalisation, which
will be added in a future patch, easier to implement.
---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  5 +
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  5 +
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 16 ++++
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  6 ++
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 12 ++-
 .../SelectionDAG/SelectionDAGBuilder.h        |  1 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |  3 +
 .../Target/AArch64/AArch64ISelLowering.cpp    | 94 +++++++++----------
 8 files changed, 93 insertions(+), 49 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 604dc9419025b0..f93f82eb35d1f3 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1451,6 +1451,11 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
+  // The `llvm.experimental.vector.partial.reduce.add` intrinsic
+  // Operands: Accumulator, Input
+  // Outputs: Output
+  PARTIAL_REDUCE_ADD,
+
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
   // Outputs: output chain, glue
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index ff7caec41855fd..0ca0fa827c6784 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,6 +1602,11 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
+  /// Get a partial reduction SD node for the DAG. This is done when the input
+  /// and output types can be legalised for wide add(s) or dot product(s)
+  SDValue getPartialReduceAddSDNode(SDLoc DL, SDValue Chain, SDValue Acc,
+                                    SDValue Input);
+
   /// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are
   /// its operands and ReducedTY is the intrinsic's return type.
   SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 03899493847b39..18e571f8eb063f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -3033,6 +3033,22 @@ class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
   }
 };
 
+class PartialReduceAddSDNode : public SDNode {
+public:
+  friend class SelectionDAG;
+
+  PartialReduceAddSDNode(const DebugLoc &dl, SDVTList VTs)
+      : SDNode(ISD::PARTIAL_REDUCE_ADD, 0, dl, VTs) {}
+
+  const SDValue &getChain() const { return getOperand(0); }
+  const SDValue &getAcc() const { return getOperand(1); }
+  const SDValue &getInput() const { return getOperand(2); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::PARTIAL_REDUCE_ADD;
+  }
+};
+
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0dfd0302ae5438..c2a52cf1e8314f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2467,6 +2467,12 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
   return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
 }
 
+SDValue SelectionDAG::getPartialReduceAddSDNode(SDLoc DL, SDValue Chain,
+                                                SDValue Acc, SDValue Input) {
+  return getNode(ISD::PARTIAL_REDUCE_ADD, DL, Acc.getValueType(), Chain, Acc,
+                 Input);
+}
+
 SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
                                           SDValue Op2) {
   EVT FullTy = Op2.getValueType();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f8d7c3ef7bbe71..b516986f0a5869 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6422,6 +6422,16 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
   DAG.setRoot(Histogram);
 }
 
+void SelectionDAGBuilder::visitPartialReduceAdd(const CallInst &I,
+                                                unsigned IntrinsicID) {
+  SDLoc dl = getCurSDLoc();
+  SDValue Acc = getValue(I.getOperand(0));
+  SDValue Input = getValue(I.getOperand(1));
+  SDValue Chain = getRoot();
+
+  setValue(&I, DAG.getPartialReduceAddSDNode(dl, Chain, Acc, Input));
+}
+
 void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
                                                        unsigned Intrinsic) {
   assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
@@ -8137,7 +8147,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::experimental_vector_partial_reduce_add: {
 
     if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
-      visitTargetIntrinsic(I, Intrinsic);
+      visitPartialReduceAdd(I, Intrinsic);
       return;
     }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 3a8dc25e98700e..a9e0c8f1ea10c1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -629,6 +629,7 @@ class SelectionDAGBuilder {
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
   void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
   void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
+  void visitPartialReduceAdd(const CallInst &, unsigned IntrinsicID);
   void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic);
   void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
                    const SmallVectorImpl<SDValue> &OpValues);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 580ff19065557b..8ce03b14bda46c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,6 +567,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return "histogram";
 
+  case ISD::PARTIAL_REDUCE_ADD:
+    return "partial_reduce_add";
+
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
   case ISD::SDID:                                                              \
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index ef00b092fe5e06..36c205808d8b0d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1124,6 +1124,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(
       {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
 
+  setTargetDAGCombine(ISD::PARTIAL_REDUCE_ADD);
+
   setTargetDAGCombine(ISD::FP_EXTEND);
 
   setTargetDAGCombine(ISD::GlobalAddress);
@@ -21969,40 +21971,23 @@ static SDValue tryCombineWhileLo(SDNode *N,
   return SDValue(N, 0);
 }
 
-SDValue tryLowerPartialReductionToDot(SDNode *N,
+SDValue tryLowerPartialReductionToDot(PartialReduceAddSDNode *PR,
                                       const AArch64Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
 
-  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-         getIntrinsicID(N) ==
-             Intrinsic::experimental_vector_partial_reduce_add &&
-         "Expected a partial reduction node");
-
-  bool Scalable = N->getValueType(0).isScalableVector();
+  bool Scalable = PR->getValueType(0).isScalableVector();
   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
     return SDValue();
   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
     return SDValue();
 
-  SDLoc DL(N);
+  SDLoc DL(PR);
 
-  SDValue Op2 = N->getOperand(2);
-  unsigned Op2Opcode = Op2->getOpcode();
-  SDValue MulOpLHS, MulOpRHS;
-  bool MulOpLHSIsSigned, MulOpRHSIsSigned;
-  if (ISD::isExtOpcode(Op2Opcode)) {
-    MulOpLHSIsSigned = MulOpRHSIsSigned = (Op2Opcode == ISD::SIGN_EXTEND);
-    MulOpLHS = Op2->getOperand(0);
-    MulOpRHS = DAG.getConstant(1, DL, MulOpLHS.getValueType());
-  } else if (Op2Opcode == ISD::MUL) {
-    SDValue ExtMulOpLHS = Op2->getOperand(0);
-    SDValue ExtMulOpRHS = Op2->getOperand(1);
-
-    unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
-    unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
-    if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
-        !ISD::isExtOpcode(ExtMulOpRHSOpcode))
-      return SDValue();
+  // The narrower of the two operands. Used as the accumulator
+  auto NarrowOp = PR->getAcc();
+  auto MulOp = PR->getInput();
+  if (MulOp->getOpcode() != ISD::MUL)
+    return SDValue();
 
     MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
     MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
@@ -22015,9 +22000,8 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   } else
     return SDValue();
 
-  SDValue Acc = N->getOperand(1);
-  EVT ReducedVT = N->getValueType(0);
-  EVT MulSrcVT = MulOpLHS.getValueType();
+  EVT ReducedType = PR->getValueType(0);
+  EVT MulSrcType = A.getValueType();
 
   // Dot products operate on chunks of four elements so there must be four times
   // as many elements in the wide type
@@ -22035,7 +22019,7 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
     if (!Subtarget->hasMatMulInt8())
       return SDValue();
 
-    bool Scalable = N->getValueType(0).isScalableVT();
+    bool Scalable = PR->getValueType(0).isScalableVT();
     // There's no nxv2i64 version of usdot
     if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
       return SDValue();
@@ -22064,24 +22048,18 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
 }
 
-SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
+SDValue tryLowerPartialReductionToWideAdd(PartialReduceAddSDNode *PR,
                                           const AArch64Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
 
-  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
-         getIntrinsicID(N) ==
-             Intrinsic::experimental_vector_partial_reduce_add &&
-         "Expected a partial reduction node");
-
   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
     return SDValue();
 
-  SDLoc DL(N);
+  SDLoc DL(PR);
+
+  auto Acc = PR->getAcc();
+  auto ExtInput = PR->getInput();
 
-  if (!ISD::isExtOpcode(N->getOperand(2).getOpcode()))
-    return SDValue();
-  SDValue Acc = N->getOperand(1);
-  SDValue Ext = N->getOperand(2);
   EVT AccVT = Acc.getValueType();
   EVT ExtVT = Ext.getValueType();
   if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
@@ -22103,6 +22081,32 @@ SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
   return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
 }
 
+static SDValue
+performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
+                               const AArch64Subtarget *Subtarget) {
+  auto *PR = cast<PartialReduceAddSDNode>(N);
+  if (auto Dot = tryLowerPartialReductionToDot(PR, Subtarget, DAG))
+    return Dot;
+  if (auto WideAdd = tryLowerPartialReductionToWideAdd(PR, Subtarget, DAG))
+    return WideAdd;
+  return DAG.getPartialReduceAdd(SDLoc(PR), PR->getValueType(0), PR->getAcc(),
+                                 PR->getInput());
+}
+
+
+
+static SDValue
+performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
+                               const AArch64Subtarget *Subtarget) {
+  auto *PR = cast<PartialReduceAddSDNode>(N);
+  if (auto Dot = tryLowerPartialReductionToDot(PR, Subtarget, DAG))
+    return Dot;
+  if (auto WideAdd = tryLowerPartialReductionToWideAdd(PR, Subtarget, DAG))
+    return WideAdd;
+  return DAG.getPartialReduceAdd(SDLoc(PR), PR->getValueType(0), PR->getAcc(),
+                                 PR->getInput());
+}
+
 static SDValue performIntrinsicCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const AArch64Subtarget *Subtarget) {
@@ -22111,14 +22115,6 @@ static SDValue performIntrinsicCombine(SDNode *N,
   switch (IID) {
   default:
     break;
-  case Intrinsic::experimental_vector_partial_reduce_add: {
-    if (SDValue Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
-      return Dot;
-    if (SDValue WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
-      return WideAdd;
-    return DAG.getPartialReduceAdd(SDLoc(N), N->getValueType(0),
-                                   N->getOperand(1), N->getOperand(2));
-  }
   case Intrinsic::aarch64_neon_vcvtfxs2fp:
   case Intrinsic::aarch64_neon_vcvtfxu2fp:
     return tryCombineFixedPointConvert(N, DCI, DAG);
@@ -26404,6 +26400,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MSCATTER:
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return performMaskedGatherScatterCombine(N, DCI, DAG);
+  case ISD::PARTIAL_REDUCE_ADD:
+    return performPartialReduceAddCombine(N, DAG, Subtarget);
   case ISD::FP_EXTEND:
     return performFPExtendCombine(N, DAG, DCI, Subtarget);
   case AArch64ISD::BRCOND:

>From a01d2ffe1ec5cf368e8242a4bf3bf08358a17d9e Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Wed, 27 Nov 2024 09:48:06 +0000
Subject: [PATCH 02/13] Changes to previous patch. Involves removing
 PartialReduceAddSDNode as well as changing how the intrinsic is transformed
 into the SD node.

---
 llvm/include/llvm/CodeGen/SelectionDAG.h      | 13 +++-----
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 16 ----------
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 10 ++----
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 21 ++++---------
 .../SelectionDAG/SelectionDAGBuilder.h        |  1 -
 .../Target/AArch64/AArch64ISelLowering.cpp    | 31 +++++++++----------
 6 files changed, 27 insertions(+), 65 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0ca0fa827c6784..7503c52e31348e 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,15 +1602,10 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Get a partial reduction SD node for the DAG. This is done when the input
-  /// and output types can be legalised for wide add(s) or dot product(s)
-  SDValue getPartialReduceAddSDNode(SDLoc DL, SDValue Chain, SDValue Acc,
-                                    SDValue Input);
-
-  /// Create the DAG equivalent of vector_partial_reduce where Op1 and Op2 are
-  /// its operands and ReducedTY is the intrinsic's return type.
-  SDValue getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
-                              SDValue Op2);
+  /// Expands partial reduce node which can't be lowered to wide add or dot
+  /// product instruction(s)
+  SDValue expandPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
+                                 SDValue Op2);
 
   /// Expands a node with multiple results to an FP or vector libcall. The
   /// libcall is expected to take all the operands of the \p Node followed by
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 18e571f8eb063f..03899493847b39 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -3033,22 +3033,6 @@ class MaskedHistogramSDNode : public MaskedGatherScatterSDNode {
   }
 };
 
-class PartialReduceAddSDNode : public SDNode {
-public:
-  friend class SelectionDAG;
-
-  PartialReduceAddSDNode(const DebugLoc &dl, SDVTList VTs)
-      : SDNode(ISD::PARTIAL_REDUCE_ADD, 0, dl, VTs) {}
-
-  const SDValue &getChain() const { return getOperand(0); }
-  const SDValue &getAcc() const { return getOperand(1); }
-  const SDValue &getInput() const { return getOperand(2); }
-
-  static bool classof(const SDNode *N) {
-    return N->getOpcode() == ISD::PARTIAL_REDUCE_ADD;
-  }
-};
-
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c2a52cf1e8314f..7b38d7c424d0df 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2467,14 +2467,8 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
   return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
 }
 
-SDValue SelectionDAG::getPartialReduceAddSDNode(SDLoc DL, SDValue Chain,
-                                                SDValue Acc, SDValue Input) {
-  return getNode(ISD::PARTIAL_REDUCE_ADD, DL, Acc.getValueType(), Chain, Acc,
-                 Input);
-}
-
-SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
-                                          SDValue Op2) {
+SDValue SelectionDAG::expandPartialReduceAdd(SDLoc DL, EVT ReducedTy,
+                                             SDValue Op1, SDValue Op2) {
   EVT FullTy = Op2.getValueType();
 
   unsigned Stride = ReducedTy.getVectorMinNumElements();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b516986f0a5869..e4c8a7801c9332 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6422,16 +6422,6 @@ void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I,
   DAG.setRoot(Histogram);
 }
 
-void SelectionDAGBuilder::visitPartialReduceAdd(const CallInst &I,
-                                                unsigned IntrinsicID) {
-  SDLoc dl = getCurSDLoc();
-  SDValue Acc = getValue(I.getOperand(0));
-  SDValue Input = getValue(I.getOperand(1));
-  SDValue Chain = getRoot();
-
-  setValue(&I, DAG.getPartialReduceAddSDNode(dl, Chain, Acc, Input));
-}
-
 void SelectionDAGBuilder::visitVectorExtractLastActive(const CallInst &I,
                                                        unsigned Intrinsic) {
   assert(Intrinsic == Intrinsic::experimental_vector_extract_last_active &&
@@ -8145,15 +8135,16 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     return;
   }
   case Intrinsic::experimental_vector_partial_reduce_add: {
+    SDLoc dl = getCurSDLoc();
+    SDValue Acc = getValue(I.getOperand(0));
+    EVT AccVT = Acc.getValueType();
+    SDValue Input = getValue(I.getOperand(1));
 
     if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
-      visitPartialReduceAdd(I, Intrinsic);
+      setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_ADD, dl, AccVT, Acc, Input));
       return;
     }
-
-    setValue(&I, DAG.getPartialReduceAdd(sdl, EVT::getEVT(I.getType()),
-                                         getValue(I.getOperand(0)),
-                                         getValue(I.getOperand(1))));
+    setValue(&I, DAG.expandPartialReduceAdd(dl, AccVT, Acc, Input));
     return;
   }
   case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index a9e0c8f1ea10c1..3a8dc25e98700e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -629,7 +629,6 @@ class SelectionDAGBuilder {
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
   void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
   void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
-  void visitPartialReduceAdd(const CallInst &, unsigned IntrinsicID);
   void visitVectorExtractLastActive(const CallInst &I, unsigned Intrinsic);
   void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
                    const SmallVectorImpl<SDValue> &OpValues);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 36c205808d8b0d..b27c6aa092c39a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21971,21 +21971,21 @@ static SDValue tryCombineWhileLo(SDNode *N,
   return SDValue(N, 0);
 }
 
-SDValue tryLowerPartialReductionToDot(PartialReduceAddSDNode *PR,
+SDValue tryLowerPartialReductionToDot(SDNode *N,
                                       const AArch64Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
 
-  bool Scalable = PR->getValueType(0).isScalableVector();
+  bool Scalable = N->getValueType(0).isScalableVector();
   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
     return SDValue();
   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
     return SDValue();
 
-  SDLoc DL(PR);
+  SDLoc DL(N);
 
   // The narrower of the two operands. Used as the accumulator
-  auto NarrowOp = PR->getAcc();
-  auto MulOp = PR->getInput();
+  auto NarrowOp = N->getOperand(0);
+  auto MulOp = N->getOperand(1);
   if (MulOp->getOpcode() != ISD::MUL)
     return SDValue();
 
@@ -22000,7 +22000,7 @@ SDValue tryLowerPartialReductionToDot(PartialReduceAddSDNode *PR,
   } else
     return SDValue();
 
-  EVT ReducedType = PR->getValueType(0);
+  EVT ReducedType = N->getValueType(0);
   EVT MulSrcType = A.getValueType();
 
   // Dot products operate on chunks of four elements so there must be four times
@@ -22019,7 +22019,7 @@ SDValue tryLowerPartialReductionToDot(PartialReduceAddSDNode *PR,
     if (!Subtarget->hasMatMulInt8())
       return SDValue();
 
-    bool Scalable = PR->getValueType(0).isScalableVT();
+    bool Scalable = N->getValueType(0).isScalableVT();
     // There's no nxv2i64 version of usdot
     if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
       return SDValue();
@@ -22048,17 +22048,17 @@ SDValue tryLowerPartialReductionToDot(PartialReduceAddSDNode *PR,
   return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
 }
 
-SDValue tryLowerPartialReductionToWideAdd(PartialReduceAddSDNode *PR,
+SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
                                           const AArch64Subtarget *Subtarget,
                                           SelectionDAG &DAG) {
 
   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
     return SDValue();
 
-  SDLoc DL(PR);
+  SDLoc DL(N);
 
-  auto Acc = PR->getAcc();
-  auto ExtInput = PR->getInput();
+  auto Acc = N->getOperand(0);
+  auto ExtInput = N->getOperand(1);
 
   EVT AccVT = Acc.getValueType();
   EVT ExtVT = Ext.getValueType();
@@ -22084,13 +22084,12 @@ SDValue tryLowerPartialReductionToWideAdd(PartialReduceAddSDNode *PR,
 static SDValue
 performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
                                const AArch64Subtarget *Subtarget) {
-  auto *PR = cast<PartialReduceAddSDNode>(N);
-  if (auto Dot = tryLowerPartialReductionToDot(PR, Subtarget, DAG))
+  if (auto Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
     return Dot;
-  if (auto WideAdd = tryLowerPartialReductionToWideAdd(PR, Subtarget, DAG))
+  if (auto WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
     return WideAdd;
-  return DAG.getPartialReduceAdd(SDLoc(PR), PR->getValueType(0), PR->getAcc(),
-                                 PR->getInput());
+  return DAG.expandPartialReduceAdd(SDLoc(N), N->getValueType(0),
+                                    N->getOperand(0), N->getOperand(1));
 }
 
 

>From 9b53a656d47cef02b3d346b6c64fd51c20f697df Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Thu, 28 Nov 2024 12:11:36 +0000
Subject: [PATCH 03/13] Remove unnecessary function parameter and update
 comments

---
 llvm/include/llvm/CodeGen/ISDOpcodes.h                | 3 ++-
 llvm/include/llvm/CodeGen/SelectionDAG.h              | 9 +++++----
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp        | 5 +++--
 llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 2 +-
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp       | 3 +--
 5 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index f93f82eb35d1f3..31cf6bc371e1a5 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1451,7 +1451,8 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
-  // The `llvm.experimental.vector.partial.reduce.add` intrinsic
+  // This corresponds to the `llvm.experimental.vector.partial.reduce.add`
+  // intrinsic
   // Operands: Accumulator, Input
   // Outputs: Output
   PARTIAL_REDUCE_ADD,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 7503c52e31348e..0d25ef31d00ceb 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,10 +1602,11 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Expands partial reduce node which can't be lowered to wide add or dot
-  /// product instruction(s)
-  SDValue expandPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
-                                 SDValue Op2);
+  /// Expands PARTIAL_REDUCE_ADD nodes which can't be lowered.
+  /// @param Op1 Accumulator for where the result is stored for the partial
+  /// reduction operation
+  /// @param Op2 Input for the partial reduction operation
+  SDValue expandPartialReduceAdd(SDLoc DL, SDValue Op1, SDValue Op2);
 
   /// Expands a node with multiple results to an FP or vector libcall. The
   /// libcall is expected to take all the operands of the \p Node followed by
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 7b38d7c424d0df..b720379497d911 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2467,8 +2467,9 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
   return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
 }
 
-SDValue SelectionDAG::expandPartialReduceAdd(SDLoc DL, EVT ReducedTy,
-                                             SDValue Op1, SDValue Op2) {
+SDValue SelectionDAG::expandPartialReduceAdd(SDLoc DL, SDValue Op1,
+                                             SDValue Op2) {
+  EVT ReducedTy = Op1.getValueType();
   EVT FullTy = Op2.getValueType();
 
   unsigned Stride = ReducedTy.getVectorMinNumElements();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e4c8a7801c9332..830d6f38313441 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8144,7 +8144,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
       setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_ADD, dl, AccVT, Acc, Input));
       return;
     }
-    setValue(&I, DAG.expandPartialReduceAdd(dl, AccVT, Acc, Input));
+    setValue(&I, DAG.expandPartialReduceAdd(dl, Acc, Input));
     return;
   }
   case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b27c6aa092c39a..020a6bdea44e89 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22088,8 +22088,7 @@ performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
     return Dot;
   if (auto WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
     return WideAdd;
-  return DAG.expandPartialReduceAdd(SDLoc(N), N->getValueType(0),
-                                    N->getOperand(0), N->getOperand(1));
+  return DAG.expandPartialReduceAdd(SDLoc(N), N->getOperand(0), N->getOperand(1));
 }
 
 

>From a735d477ca1efd3a8fe3f898a1c63378568a4d2e Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Thu, 28 Nov 2024 13:17:15 +0000
Subject: [PATCH 04/13] Code formatting changes

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 020a6bdea44e89..2437ed7ade62f5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22088,7 +22088,8 @@ performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
     return Dot;
   if (auto WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
     return WideAdd;
-  return DAG.expandPartialReduceAdd(SDLoc(N), N->getOperand(0), N->getOperand(1));
+  return DAG.expandPartialReduceAdd(SDLoc(N), N->getOperand(0),
+                                    N->getOperand(1));
 }
 
 

>From 21b7d58f19ddb20971d7af82837e9e7593793204 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Mon, 2 Dec 2024 09:41:36 +0000
Subject: [PATCH 05/13] Make two ISD nodes for partial reductions as opposed to
 one

---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  5 +-
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  9 ++--
 llvm/include/llvm/CodeGen/TargetLowering.h    | 10 ++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  7 ++-
 .../SelectionDAG/SelectionDAGDumper.cpp       |  6 ++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 48 ++++++++++++-------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +
 7 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 31cf6bc371e1a5..e5bf4c1957c702 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1451,11 +1451,12 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
-  // This corresponds to the `llvm.experimental.vector.partial.reduce.add`
+  // These correspond to the `llvm.experimental.vector.partial.reduce.add`
   // intrinsic
   // Operands: Accumulator, Input
   // Outputs: Output
-  PARTIAL_REDUCE_ADD,
+  PARTIAL_REDUCE_SADD,
+  PARTIAL_REDUCE_UADD,
 
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 0d25ef31d00ceb..74b7e27e3d839f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,10 +1602,11 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Expands PARTIAL_REDUCE_ADD nodes which can't be lowered.
-  /// @param Op1 Accumulator for where the result is stored for the partial
-  /// reduction operation
-  /// @param Op2 Input for the partial reduction operation
+  /// Expands PARTIAL_REDUCE_S/UADD nodes to a sequence of subvector extracts
+  /// followed by vector adds.
+  /// \p Op1 Accumulator for where the result is stored for the partial
+  /// reduction operation.
+  /// \p Op2 Input for the partial reduction operation.
   SDValue expandPartialReduceAdd(SDLoc DL, SDValue Op1, SDValue Op2);
 
   /// Expands a node with multiple results to an FP or vector libcall. The
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3751aac4df8ead..bb3369007abd79 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -462,6 +462,16 @@ class TargetLoweringBase {
     return true;
   }
 
+  /// Return true if there is a sign extend on the input to this function. Used
+  /// to determine whether to transform the
+  /// @llvm.experimental.vector.partial.reduce.* intrinsic to
+  /// PARTIAL_REDUCE_SADD or PARTIAL_REDUCE_UADD. It also removes the extend
+  /// from the input. \p Input The 'Input' operand to the
+  /// @llvm.experimental.vector.partial.reduce.* intrinsic.
+  virtual bool isPartialReductionInputSigned(SDValue &Input) const {
+    return false;
+  }
+
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
   /// using generic code in SelectionDAGBuilder.
   virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 830d6f38313441..241eb4658b274a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8141,7 +8141,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Input = getValue(I.getOperand(1));
 
     if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
-      setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_ADD, dl, AccVT, Acc, Input));
+      if (TLI.isPartialReductionInputSigned(Input))
+        setValue(&I,
+                 DAG.getNode(ISD::PARTIAL_REDUCE_SADD, dl, AccVT, Acc, Input));
+      else
+        setValue(&I,
+                 DAG.getNode(ISD::PARTIAL_REDUCE_UADD, dl, AccVT, Acc, Input));
       return;
     }
     setValue(&I, DAG.expandPartialReduceAdd(dl, Acc, Input));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 8ce03b14bda46c..1a710a47095189 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,8 +567,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return "histogram";
 
-  case ISD::PARTIAL_REDUCE_ADD:
-    return "partial_reduce_add";
+  case ISD::PARTIAL_REDUCE_UADD:
+    return "partial_reduce_uadd";
+  case ISD::PARTIAL_REDUCE_SADD:
+    return "partial_reduce_sadd";
 
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2437ed7ade62f5..f1e7d00227afbd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1124,7 +1124,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(
       {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
 
-  setTargetDAGCombine(ISD::PARTIAL_REDUCE_ADD);
+  setTargetDAGCombine({ISD::PARTIAL_REDUCE_SADD, ISD::PARTIAL_REDUCE_UADD});
 
   setTargetDAGCombine(ISD::FP_EXTEND);
 
@@ -2042,13 +2042,30 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
   EVT VT = EVT::getEVT(I->getType());
   auto Op1 = I->getOperand(1);
   EVT Op1VT = EVT::getEVT(Op1->getType());
-  if (Op1VT.getVectorElementType() == VT.getVectorElementType() &&
-      (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount() ||
-       VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()))
+  if ((Op1VT == MVT::nxv4i64 && VT == MVT::nxv2i64) ||
+      (Op1VT == MVT::nxv8i32 && VT == MVT::nxv4i32) ||
+      (Op1VT == MVT::nxv16i16 && VT == MVT::nxv8i16) ||
+      (Op1VT == MVT::nxv16i64 && VT == MVT::nxv4i64) ||
+      (Op1VT == MVT::nxv16i32 && VT == MVT::nxv4i32) ||
+      (Op1VT == MVT::nxv8i64 && VT == MVT::nxv2i64) ||
+      (Op1VT == MVT::v16i64 && VT == MVT::v4i64) ||
+      (Op1VT == MVT::v16i32 && VT == MVT::v4i32) ||
+      (Op1VT == MVT::v8i32 && VT == MVT::v2i32))
     return false;
   return true;
 }
 
+bool AArch64TargetLowering::isPartialReductionInputSigned(
+    SDValue &Input) const {
+  unsigned InputOpcode = Input.getOpcode();
+  if (ISD::isExtOpcode(InputOpcode)) {
+    Input = Input.getOperand(0);
+    if (InputOpcode == ISD::SIGN_EXTEND)
+      return true;
+  }
+  return false;
+}
+
 bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
   if (!Subtarget->isSVEorStreamingSVEAvailable())
     return true;
@@ -22058,27 +22075,21 @@ SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
   SDLoc DL(N);
 
   auto Acc = N->getOperand(0);
-  auto ExtInput = N->getOperand(1);
+  auto Input = N->getOperand(1);
 
   EVT AccVT = Acc.getValueType();
-  EVT ExtVT = Ext.getValueType();
-  if (ExtVT.getVectorElementType() != AccVT.getVectorElementType())
-    return SDValue();
-
-  SDValue ExtOp = Ext->getOperand(0);
-  EVT ExtOpVT = ExtOp.getValueType();
+  EVT InputVT = Input.getValueType();
 
   if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
       !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
       !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
     return SDValue();
 
-  bool ExtOpIsSigned = Ext.getOpcode() == ISD::SIGN_EXTEND;
-  unsigned BottomOpcode =
-      ExtOpIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
-  unsigned TopOpcode = ExtOpIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
-  SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, ExtOp);
-  return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, ExtOp);
+  bool InputIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SADD;
+  auto BottomOpcode = InputIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
+  auto TopOpcode = InputIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
+  auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input);
+  return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, Input);
 }
 
 static SDValue
@@ -26399,7 +26410,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MSCATTER:
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return performMaskedGatherScatterCombine(N, DCI, DAG);
-  case ISD::PARTIAL_REDUCE_ADD:
+  case ISD::PARTIAL_REDUCE_UADD:
+  case ISD::PARTIAL_REDUCE_SADD:
     return performPartialReduceAddCombine(N, DAG, Subtarget);
   case ISD::FP_EXTEND:
     return performFPExtendCombine(N, DAG, DCI, Subtarget);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 85b62be5dd30dd..c4c77104a3af06 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -996,6 +996,8 @@ class AArch64TargetLowering : public TargetLowering {
   bool
   shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
 
+  bool isPartialReductionInputSigned(SDValue &Input) const override;
+
   bool shouldExpandCttzElements(EVT VT) const override;
 
   bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override;

>From bacacda22f31af901d9e8a782143fa2299694534 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Mon, 9 Dec 2024 13:41:41 +0000
Subject: [PATCH 06/13] Determine which ISD node to use in DAG combine rather
 than in SelectionDAGBuilder.

---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  4 +-
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  3 +-
 llvm/include/llvm/CodeGen/TargetLowering.h    | 10 --
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  8 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 97 ++++++++++---------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 -
 6 files changed, 55 insertions(+), 69 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index e5bf4c1957c702..d2d751cb1b7328 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1451,8 +1451,8 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
-  // These correspond to the `llvm.experimental.vector.partial.reduce.add`
-  // intrinsic
+  // Nodes used to represent a partial reduction addition operation (signed and
+  // unsigned).
   // Operands: Accumulator, Input
   // Outputs: Output
   PARTIAL_REDUCE_SADD,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 74b7e27e3d839f..aa0aa37f132417 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,8 +1602,7 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Expands PARTIAL_REDUCE_S/UADD nodes to a sequence of subvector extracts
-  /// followed by vector adds.
+  /// Expands PARTIAL_REDUCE_S/UADD nodes.
   /// \p Op1 Accumulator for where the result is stored for the partial
   /// reduction operation.
   /// \p Op2 Input for the partial reduction operation.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index bb3369007abd79..3751aac4df8ead 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -462,16 +462,6 @@ class TargetLoweringBase {
     return true;
   }
 
-  /// Return true if there is a sign extend on the input to this function. Used
-  /// to determine whether to transform the
-  /// @llvm.experimental.vector.partial.reduce.* intrinsic to
-  /// PARTIAL_REDUCE_SADD or PARTIAL_REDUCE_UADD. It also removes the extend
-  /// from the input. \p Input The 'Input' operand to the
-  /// @llvm.experimental.vector.partial.reduce.* intrinsic.
-  virtual bool isPartialReductionInputSigned(SDValue &Input) const {
-    return false;
-  }
-
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
   /// using generic code in SelectionDAGBuilder.
   virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 241eb4658b274a..077a27878f4b6e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8141,12 +8141,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Input = getValue(I.getOperand(1));
 
     if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
-      if (TLI.isPartialReductionInputSigned(Input))
-        setValue(&I,
-                 DAG.getNode(ISD::PARTIAL_REDUCE_SADD, dl, AccVT, Acc, Input));
-      else
-        setValue(&I,
-                 DAG.getNode(ISD::PARTIAL_REDUCE_UADD, dl, AccVT, Acc, Input));
+      setValue(&I,
+               DAG.getNode(ISD::PARTIAL_REDUCE_UADD, dl, AccVT, Acc, Input));
       return;
     }
     setValue(&I, DAG.expandPartialReduceAdd(dl, Acc, Input));
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index f1e7d00227afbd..933f24fea05b93 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2055,17 +2055,6 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
   return true;
 }
 
-bool AArch64TargetLowering::isPartialReductionInputSigned(
-    SDValue &Input) const {
-  unsigned InputOpcode = Input.getOpcode();
-  if (ISD::isExtOpcode(InputOpcode)) {
-    Input = Input.getOperand(0);
-    if (InputOpcode == ISD::SIGN_EXTEND)
-      return true;
-  }
-  return false;
-}
-
 bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
   if (!Subtarget->isSVEorStreamingSVEAvailable())
     return true;
@@ -22006,48 +21995,54 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   if (MulOp->getOpcode() != ISD::MUL)
     return SDValue();
 
-    MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
-    MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
-
-    MulOpLHS = ExtMulOpLHS->getOperand(0);
-    MulOpRHS = ExtMulOpRHS->getOperand(0);
+  auto A = MulOp->getOperand(0);
+  auto B = MulOp->getOperand(1);
 
-    if (MulOpLHS.getValueType() != MulOpRHS.getValueType())
+  unsigned AOpcode = A->getOpcode();
+  unsigned BOpcode = B->getOpcode();
+  unsigned Opcode;
+  EVT ReducedType = N->getValueType(0);
+  EVT MulSrcType;
+  if (ISD::isExtOpcode(AOpcode) || ISD::isExtOpcode(BOpcode)) {
+    bool AIsSigned = AOpcode == ISD::SIGN_EXTEND;
+    bool BIsSigned = BOpcode == ISD::SIGN_EXTEND;
+
+    A = A->getOperand(0);
+    B = B->getOperand(0);
+    if (A.getValueType() != B.getValueType())
       return SDValue();
-  } else
-    return SDValue();
 
-  EVT ReducedType = N->getValueType(0);
-  EVT MulSrcType = A.getValueType();
+    if (AIsSigned != BIsSigned) {
+      if (!Subtarget->hasMatMulInt8())
+        return SDValue();
+
+      bool Scalable = N->getValueType(0).isScalableVT();
+      // There's no nxv2i64 version of usdot
+      if (Scalable && ReducedType != MVT::nxv4i32 &&
+          ReducedType != MVT::nxv4i64)
+        return SDValue();
+
+      Opcode = AArch64ISD::USDOT;
+      // USDOT expects the signed operand to be last
+      if (!BIsSigned)
+        std::swap(A, B);
+    } else if (AIsSigned)
+      Opcode = AArch64ISD::SDOT;
+    else
+      Opcode = AArch64ISD::UDOT;
+    MulSrcType = A.getValueType();
+  }
 
   // Dot products operate on chunks of four elements so there must be four times
   // as many elements in the wide type
-  if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
-      !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
-      !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
-      !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
-      !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
-      !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
+  if (!(ReducedType == MVT::nxv4i64 && MulSrcType == MVT::nxv16i8) &&
+      !(ReducedType == MVT::nxv4i32 && MulSrcType == MVT::nxv16i8) &&
+      !(ReducedType == MVT::nxv2i64 && MulSrcType == MVT::nxv8i16) &&
+      !(ReducedType == MVT::v4i64 && MulSrcType == MVT::v16i8) &&
+      !(ReducedType == MVT::v4i32 && MulSrcType == MVT::v16i8) &&
+      !(ReducedType == MVT::v2i32 && MulSrcType == MVT::v8i8))
     return SDValue();
 
-  // If the extensions are mixed, we should lower it to a usdot instead
-  unsigned Opcode = 0;
-  if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
-    if (!Subtarget->hasMatMulInt8())
-      return SDValue();
-
-    bool Scalable = N->getValueType(0).isScalableVT();
-    // There's no nxv2i64 version of usdot
-    if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
-      return SDValue();
-
-    Opcode = AArch64ISD::USDOT;
-    // USDOT expects the signed operand to be last
-    if (!MulOpRHSIsSigned)
-      std::swap(MulOpLHS, MulOpRHS);
-  } else
-    Opcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
-
   // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
   // product followed by a zero / sign extension
   if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
@@ -22077,15 +22072,23 @@ SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
   auto Acc = N->getOperand(0);
   auto Input = N->getOperand(1);
 
-  EVT AccVT = Acc.getValueType();
+  unsigned Opcode = N->getOpcode();
+  unsigned InputOpcode = Input.getOpcode();
+  if (ISD::isExtOpcode(InputOpcode)) {
+    Input = Input.getOperand(0);
+    if (InputOpcode == ISD::SIGN_EXTEND)
+      Opcode = ISD::PARTIAL_REDUCE_SADD;
+  }
+
   EVT InputVT = Input.getValueType();
+  EVT AccVT = Acc.getValueType();
 
   if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
       !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
       !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
     return SDValue();
 
-  bool InputIsSigned = N->getOpcode() == ISD::PARTIAL_REDUCE_SADD;
+  bool InputIsSigned = Opcode == ISD::PARTIAL_REDUCE_SADD;
   auto BottomOpcode = InputIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
   auto TopOpcode = InputIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
   auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index c4c77104a3af06..85b62be5dd30dd 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -996,8 +996,6 @@ class AArch64TargetLowering : public TargetLowering {
   bool
   shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
 
-  bool isPartialReductionInputSigned(SDValue &Input) const override;
-
   bool shouldExpandCttzElements(EVT VT) const override;
 
   bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override;

>From bd023480767019b726341ec6d19b030207645146 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Wed, 11 Dec 2024 14:41:56 +0000
Subject: [PATCH 07/13] Separate lowering code for PARTIAL_REDUCE_U/SADD

Move the lowering code out of the DAG-combine function, where it all
used to live. The DAG combine now decides whether the node should be
the signed or unsigned version of partial reduce add. A separate
function, called from LowerOperation, then does the actual lowering to
wide adds or dot products if it is able to.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 265 ++++++++++--------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   1 +
 2 files changed, 146 insertions(+), 120 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 933f24fea05b93..d6ba95339de6c7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1840,8 +1840,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
                          Custom);
     }
+
+    for (auto VT : {MVT::nxv2i64, MVT::nxv4i32, MVT::nxv8i16}) {
+      setOperationAction(ISD::PARTIAL_REDUCE_UADD, VT, Custom);
+      setOperationAction(ISD::PARTIAL_REDUCE_SADD, VT, Custom);
+    }
   }
 
+  for (auto VT : {MVT::v4i64, MVT::v4i32, MVT::v2i32}) {
+    setOperationAction(ISD::PARTIAL_REDUCE_UADD, VT, Custom);
+    setOperationAction(ISD::PARTIAL_REDUCE_SADD, VT, Custom);
+  }
 
   if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
     // Only required for llvm.aarch64.mops.memset.tag
@@ -2040,17 +2049,18 @@ bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
     return true;
 
   EVT VT = EVT::getEVT(I->getType());
-  auto Op1 = I->getOperand(1);
-  EVT Op1VT = EVT::getEVT(Op1->getType());
-  if ((Op1VT == MVT::nxv4i64 && VT == MVT::nxv2i64) ||
-      (Op1VT == MVT::nxv8i32 && VT == MVT::nxv4i32) ||
-      (Op1VT == MVT::nxv16i16 && VT == MVT::nxv8i16) ||
-      (Op1VT == MVT::nxv16i64 && VT == MVT::nxv4i64) ||
-      (Op1VT == MVT::nxv16i32 && VT == MVT::nxv4i32) ||
-      (Op1VT == MVT::nxv8i64 && VT == MVT::nxv2i64) ||
-      (Op1VT == MVT::v16i64 && VT == MVT::v4i64) ||
-      (Op1VT == MVT::v16i32 && VT == MVT::v4i32) ||
-      (Op1VT == MVT::v8i32 && VT == MVT::v2i32))
+  auto Input = I->getOperand(1);
+  EVT InputVT = EVT::getEVT(Input->getType());
+
+  if ((InputVT == MVT::nxv4i64 && VT == MVT::nxv2i64) ||
+      (InputVT == MVT::nxv8i32 && VT == MVT::nxv4i32) ||
+      (InputVT == MVT::nxv16i16 && VT == MVT::nxv8i16) ||
+      (InputVT == MVT::nxv16i64 && VT == MVT::nxv4i64) ||
+      (InputVT == MVT::nxv16i32 && VT == MVT::nxv4i32) ||
+      (InputVT == MVT::nxv8i64 && VT == MVT::nxv2i64) ||
+      (InputVT == MVT::v16i64 && VT == MVT::v4i64) ||
+      (InputVT == MVT::v16i32 && VT == MVT::v4i32) ||
+      (InputVT == MVT::v8i32 && VT == MVT::v2i32))
     return false;
   return true;
 }
@@ -7596,6 +7606,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFLDEXP(Op, DAG);
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return LowerVECTOR_HISTOGRAM(Op, DAG);
+  case ISD::PARTIAL_REDUCE_UADD:
+  case ISD::PARTIAL_REDUCE_SADD:
+    return LowerPARTIAL_REDUCE_ADD(Op, DAG);
   }
 }
 
@@ -21977,147 +21990,126 @@ static SDValue tryCombineWhileLo(SDNode *N,
   return SDValue(N, 0);
 }
 
-SDValue tryLowerPartialReductionToDot(SDNode *N,
-                                      const AArch64Subtarget *Subtarget,
-                                      SelectionDAG &DAG) {
-
-  bool Scalable = N->getValueType(0).isScalableVector();
+SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
+                               const AArch64Subtarget *Subtarget, SDLoc &DL) {
+  bool Scalable = Acc.getValueType().isScalableVector();
   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
     return SDValue();
   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
     return SDValue();
 
-  SDLoc DL(N);
-
-  // The narrower of the two operands. Used as the accumulator
-  auto NarrowOp = N->getOperand(0);
-  auto MulOp = N->getOperand(1);
-  if (MulOp->getOpcode() != ISD::MUL)
+  unsigned InputOpcode = Input->getOpcode();
+  if (InputOpcode != ISD::MUL)
     return SDValue();
-
-  auto A = MulOp->getOperand(0);
-  auto B = MulOp->getOperand(1);
-
+  auto A = Input->getOperand(0);
+  auto B = Input->getOperand(1);
   unsigned AOpcode = A->getOpcode();
   unsigned BOpcode = B->getOpcode();
-  unsigned Opcode;
-  EVT ReducedType = N->getValueType(0);
-  EVT MulSrcType;
-  if (ISD::isExtOpcode(AOpcode) || ISD::isExtOpcode(BOpcode)) {
-    bool AIsSigned = AOpcode == ISD::SIGN_EXTEND;
-    bool BIsSigned = BOpcode == ISD::SIGN_EXTEND;
-
-    A = A->getOperand(0);
-    B = B->getOperand(0);
-    if (A.getValueType() != B.getValueType())
-      return SDValue();
+  EVT AccVT = Acc->getValueType(0);
 
-    if (AIsSigned != BIsSigned) {
-      if (!Subtarget->hasMatMulInt8())
-        return SDValue();
+  if (!ISD::isExtOpcode(AOpcode) || !ISD::isExtOpcode(BOpcode))
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
 
-      bool Scalable = N->getValueType(0).isScalableVT();
-      // There's no nxv2i64 version of usdot
-      if (Scalable && ReducedType != MVT::nxv4i32 &&
-          ReducedType != MVT::nxv4i64)
-        return SDValue();
+  bool AIsSigned = AOpcode == ISD::SIGN_EXTEND;
+  bool BIsSigned = BOpcode == ISD::SIGN_EXTEND;
 
-      Opcode = AArch64ISD::USDOT;
-      // USDOT expects the signed operand to be last
-      if (!BIsSigned)
-        std::swap(A, B);
-    } else if (AIsSigned)
-      Opcode = AArch64ISD::SDOT;
-    else
-      Opcode = AArch64ISD::UDOT;
-    MulSrcType = A.getValueType();
-  }
+  A = A->getOperand(0);
+  B = B->getOperand(0);
+  EVT MulSrcVT = A.getValueType();
 
   // Dot products operate on chunks of four elements so there must be four times
   // as many elements in the wide type
-  if (!(ReducedType == MVT::nxv4i64 && MulSrcType == MVT::nxv16i8) &&
-      !(ReducedType == MVT::nxv4i32 && MulSrcType == MVT::nxv16i8) &&
-      !(ReducedType == MVT::nxv2i64 && MulSrcType == MVT::nxv8i16) &&
-      !(ReducedType == MVT::v4i64 && MulSrcType == MVT::v16i8) &&
-      !(ReducedType == MVT::v4i32 && MulSrcType == MVT::v16i8) &&
-      !(ReducedType == MVT::v2i32 && MulSrcType == MVT::v8i8))
-    return SDValue();
+  if (!(AccVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
+      !(AccVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
+      !(AccVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
+      !(AccVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
+      !(AccVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
+      !(AccVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+
+  unsigned DotOpcode = AIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
+  if (AIsSigned != BIsSigned) {
+    if (!Subtarget->hasMatMulInt8())
+      return DAG.expandPartialReduceAdd(DL, Acc, Input);
+
+    bool Scalable = AccVT.isScalableVT();
+    // There's no nxv2i64 version of usdot
+    if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
+      return DAG.expandPartialReduceAdd(DL, Acc, Input);
+
+    if (!BIsSigned)
+      std::swap(A, B);
+    DotOpcode = AArch64ISD::USDOT;
+    // Lower usdot patterns here because legalisation would attempt to split it
+    // unless exts are removed. But, removing the exts would lose the
+    // information about whether each operand is signed.
+    if ((AccVT != MVT::nxv4i64 || MulSrcVT != MVT::nxv16i8) &&
+        (AccVT != MVT::v4i64 || MulSrcVT != MVT::v16i8))
+      return DAG.getNode(DotOpcode, DL, AccVT, Acc, A, B);
+  }
 
   // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
-  // product followed by a zero / sign extension
-  if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
-      (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
-    EVT ReducedVTI32 =
-        (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
+  // product followed by a zero / sign extension. Need to lower this here
+  // because legalisation would attempt to split it.
+  if ((AccVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
+      (AccVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
+    EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
 
-    SDValue DotI32 =
-        DAG.getNode(Opcode, DL, ReducedVTI32,
-                    DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
-    SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
-    return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
+    auto DotI32 = DAG.getNode(DotOpcode, DL, AccVTI32,
+                              DAG.getConstant(0, DL, AccVTI32), A, B);
+    auto Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
+    return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
   }
 
-  return DAG.getNode(Opcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
-}
+  if (A.getValueType() != B.getValueType())
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
 
-SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
-                                          const AArch64Subtarget *Subtarget,
-                                          SelectionDAG &DAG) {
+  unsigned NewOpcode =
+      AIsSigned ? ISD::PARTIAL_REDUCE_SADD : ISD::PARTIAL_REDUCE_UADD;
+  auto NewMul = DAG.getNode(ISD::MUL, DL, A.getValueType(), A, B);
+  return DAG.getNode(NewOpcode, DL, AccVT, Acc, NewMul);
+}
 
+SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
+                            const AArch64Subtarget *Subtarget, SDLoc &DL) {
   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
-    return SDValue();
-
-  SDLoc DL(N);
-
-  auto Acc = N->getOperand(0);
-  auto Input = N->getOperand(1);
-
-  unsigned Opcode = N->getOpcode();
-  unsigned InputOpcode = Input.getOpcode();
-  if (ISD::isExtOpcode(InputOpcode)) {
-    Input = Input.getOperand(0);
-    if (InputOpcode == ISD::SIGN_EXTEND)
-      Opcode = ISD::PARTIAL_REDUCE_SADD;
-  }
-
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+  unsigned InputOpcode = Input->getOpcode();
+  if (!ISD::isExtOpcode(InputOpcode))
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+  Input = Input->getOperand(0);
   EVT InputVT = Input.getValueType();
-  EVT AccVT = Acc.getValueType();
+  EVT AccVT = Acc->getValueType(0);
 
-  if (!(ExtOpVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
-      !(ExtOpVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
-      !(ExtOpVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
+  if (!(InputVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
+      !(InputVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
+      !(InputVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
     return SDValue();
 
-  bool InputIsSigned = Opcode == ISD::PARTIAL_REDUCE_SADD;
-  auto BottomOpcode = InputIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
-  auto TopOpcode = InputIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
-  auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input);
-  return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, Input);
-}
-
-static SDValue
-performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
-                               const AArch64Subtarget *Subtarget) {
-  if (auto Dot = tryLowerPartialReductionToDot(N, Subtarget, DAG))
-    return Dot;
-  if (auto WideAdd = tryLowerPartialReductionToWideAdd(N, Subtarget, DAG))
-    return WideAdd;
-  return DAG.expandPartialReduceAdd(SDLoc(N), N->getOperand(0),
-                                    N->getOperand(1));
+  unsigned NewOpcode = InputOpcode == ISD::SIGN_EXTEND
+                           ? ISD::PARTIAL_REDUCE_SADD
+                           : ISD::PARTIAL_REDUCE_UADD;
+  return DAG.getNode(NewOpcode, DL, AccVT, Acc, Input);
 }
 
+SDValue performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
+                                       const AArch64Subtarget *Subtarget) {
+  SDLoc DL(N);
+  auto Acc = N->getOperand(0);
+  auto Input = N->getOperand(1);
+  EVT AccElemVT = Acc.getValueType().getVectorElementType();
+  EVT InputElemVT = Input.getValueType().getVectorElementType();
 
+  // If the exts have already been removed or it has already been lowered to an
+  // usdot instruction, then the element types will not be equal
+  if (InputElemVT != AccElemVT || Input.getOpcode() == AArch64ISD::USDOT)
+    return SDValue(N, 0);
 
-static SDValue
-performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
-                               const AArch64Subtarget *Subtarget) {
-  auto *PR = cast<PartialReduceAddSDNode>(N);
-  if (auto Dot = tryLowerPartialReductionToDot(PR, Subtarget, DAG))
+  if (auto Dot = tryCombineToDotProduct(Acc, Input, DAG, Subtarget, DL))
     return Dot;
-  if (auto WideAdd = tryLowerPartialReductionToWideAdd(PR, Subtarget, DAG))
+  if (auto WideAdd = tryCombineToWideAdd(Acc, Input, DAG, Subtarget, DL))
     return WideAdd;
-  return DAG.getPartialReduceAdd(SDLoc(PR), PR->getValueType(0), PR->getAcc(),
-                                 PR->getInput());
+  return SDValue();
 }
 
 static SDValue performIntrinsicCombine(SDNode *N,
@@ -29176,6 +29168,39 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
   return Scatter;
 }
 
+SDValue
+AArch64TargetLowering::LowerPARTIAL_REDUCE_ADD(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Acc = Op.getOperand(0);
+  SDValue Input = Op.getOperand(1);
+
+  EVT AccVT = Acc.getValueType();
+  EVT InputVT = Input.getValueType();
+
+  unsigned Opcode = Op.getOpcode();
+
+  if (AccVT.getVectorElementCount() * 4 == InputVT.getVectorElementCount()) {
+    unsigned IndexAdd = 0;
+    // ISD::MUL may have already been lowered, meaning the operands would be in
+    // different positions.
+    if (Input.getOpcode() != ISD::MUL)
+      IndexAdd = 1;
+    auto A = Input.getOperand(IndexAdd);
+    auto B = Input.getOperand(IndexAdd + 1);
+
+    unsigned DotOpcode = Opcode == ISD::PARTIAL_REDUCE_SADD ? AArch64ISD::SDOT
+                                                            : AArch64ISD::UDOT;
+    return DAG.getNode(DotOpcode, DL, AccVT, Acc, A, B);
+  }
+  bool InputIsSigned = Opcode == ISD::PARTIAL_REDUCE_SADD;
+  unsigned BottomOpcode =
+      InputIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
+  unsigned TopOpcode = InputIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
+  auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input);
+  return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, Input);
+}
+
 SDValue
 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
                                                     SelectionDAG &DAG) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 85b62be5dd30dd..88b05f8eeba284 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1184,6 +1184,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerPARTIAL_REDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;

>From 92318044b08d22a7d8c7f85bc160db21539034e7 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Thu, 12 Dec 2024 13:50:36 +0000
Subject: [PATCH 08/13] Change the way the dot product pattern is checked for
 lowering. Add condition in wide add combine to not allow fixed length
 vectors.

---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d6ba95339de6c7..a4438997157c7c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21999,13 +21999,18 @@ SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
     return SDValue();
 
   unsigned InputOpcode = Input->getOpcode();
+  EVT AccVT = Acc->getValueType(0);
+  if (AccVT.getVectorElementCount() * 4 ==
+          Input->getValueType(0).getVectorElementCount() &&
+      InputOpcode != ISD::MUL)
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
   if (InputOpcode != ISD::MUL)
     return SDValue();
+
   auto A = Input->getOperand(0);
   auto B = Input->getOperand(1);
   unsigned AOpcode = A->getOpcode();
   unsigned BOpcode = B->getOpcode();
-  EVT AccVT = Acc->getValueType(0);
 
   if (!ISD::isExtOpcode(AOpcode) || !ISD::isExtOpcode(BOpcode))
     return DAG.expandPartialReduceAdd(DL, Acc, Input);
@@ -22080,6 +22085,8 @@ SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
   Input = Input->getOperand(0);
   EVT InputVT = Input.getValueType();
   EVT AccVT = Acc->getValueType(0);
+  if (!AccVT.isScalableVector())
+    return DAG.expandPartialReduceAdd(DL, Acc, Input);
 
   if (!(InputVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
       !(InputVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
@@ -29180,6 +29187,9 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_ADD(SDValue Op,
 
   unsigned Opcode = Op.getOpcode();
 
+  // If the following condition is true and the input opcode was not ISD::MUL
+  // during the DAG-combine, it is already expanded. So this condition means the
+  // input opcode must have been ISD::MUL.
   if (AccVT.getVectorElementCount() * 4 == InputVT.getVectorElementCount()) {
     unsigned IndexAdd = 0;
     // ISD::MUL may have already been lowered, meaning the operands would be in

>From b43db728cf2f00b6793b3b1c6db17dd5678c55b2 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Tue, 17 Dec 2024 13:59:13 +0000
Subject: [PATCH 09/13] Change from adding ISD::PARTIAL_REDUCE_S/UADD to adding
 ISD::PARTIAL_REDUCE_S/UMLA

This simplifies the lowering function, since it no longer needs to check
whether the MUL has already been lowered. Instead, the MUL's operands are
passed directly as the two inputs of the PARTIAL_REDUCE_S/UMLA node. If
there is no MUL and only a single input, the second input is a vector of
ones (for value types eligible for wide add lowering).
---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |  4 +-
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  2 +-
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  2 +-
 .../SelectionDAG/SelectionDAGDumper.cpp       |  8 +--
 .../Target/AArch64/AArch64ISelLowering.cpp    | 60 ++++++++-----------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  2 +-
 6 files changed, 34 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index d2d751cb1b7328..2436d914c80cba 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1455,8 +1455,8 @@ enum NodeType {
   // unsigned).
   // Operands: Accumulator, Input
   // Outputs: Output
-  PARTIAL_REDUCE_SADD,
-  PARTIAL_REDUCE_UADD,
+  PARTIAL_REDUCE_SMLA,
+  PARTIAL_REDUCE_UMLA,
 
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index aa0aa37f132417..349dadc8237b0a 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,7 +1602,7 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Expands PARTIAL_REDUCE_S/UADD nodes.
+  /// Expands PARTIAL_REDUCE_S/UMLA nodes.
   /// \p Op1 Accumulator for where the result is stored for the partial
   /// reduction operation.
   /// \p Op2 Input for the partial reduction operation.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 077a27878f4b6e..3ff3e3ce54d821 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8142,7 +8142,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
 
     if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
       setValue(&I,
-               DAG.getNode(ISD::PARTIAL_REDUCE_UADD, dl, AccVT, Acc, Input));
+               DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, dl, AccVT, Acc, Input));
       return;
     }
     setValue(&I, DAG.expandPartialReduceAdd(dl, Acc, Input));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 1a710a47095189..402dc949294983 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -567,10 +567,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return "histogram";
 
-  case ISD::PARTIAL_REDUCE_UADD:
-    return "partial_reduce_uadd";
-  case ISD::PARTIAL_REDUCE_SADD:
-    return "partial_reduce_sadd";
+  case ISD::PARTIAL_REDUCE_UMLA:
+    return "partial_reduce_umla";
+  case ISD::PARTIAL_REDUCE_SMLA:
+    return "partial_reduce_smla";
 
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index a4438997157c7c..0f60a9c4280322 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1124,7 +1124,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(
       {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
 
-  setTargetDAGCombine({ISD::PARTIAL_REDUCE_SADD, ISD::PARTIAL_REDUCE_UADD});
+  setTargetDAGCombine({ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA});
 
   setTargetDAGCombine(ISD::FP_EXTEND);
 
@@ -1842,14 +1842,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
 
     for (auto VT : {MVT::nxv2i64, MVT::nxv4i32, MVT::nxv8i16}) {
-      setOperationAction(ISD::PARTIAL_REDUCE_UADD, VT, Custom);
-      setOperationAction(ISD::PARTIAL_REDUCE_SADD, VT, Custom);
+      setOperationAction(ISD::PARTIAL_REDUCE_UMLA, VT, Custom);
+      setOperationAction(ISD::PARTIAL_REDUCE_SMLA, VT, Custom);
     }
   }
 
   for (auto VT : {MVT::v4i64, MVT::v4i32, MVT::v2i32}) {
-    setOperationAction(ISD::PARTIAL_REDUCE_UADD, VT, Custom);
-    setOperationAction(ISD::PARTIAL_REDUCE_SADD, VT, Custom);
+    setOperationAction(ISD::PARTIAL_REDUCE_UMLA, VT, Custom);
+    setOperationAction(ISD::PARTIAL_REDUCE_SMLA, VT, Custom);
   }
 
   if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
@@ -7606,9 +7606,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFLDEXP(Op, DAG);
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return LowerVECTOR_HISTOGRAM(Op, DAG);
-  case ISD::PARTIAL_REDUCE_UADD:
-  case ISD::PARTIAL_REDUCE_SADD:
-    return LowerPARTIAL_REDUCE_ADD(Op, DAG);
+  case ISD::PARTIAL_REDUCE_UMLA:
+  case ISD::PARTIAL_REDUCE_SMLA:
+    return LowerPARTIAL_REDUCE_MLA(Op, DAG);
   }
 }
 
@@ -22070,9 +22070,8 @@ SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
     return DAG.expandPartialReduceAdd(DL, Acc, Input);
 
   unsigned NewOpcode =
-      AIsSigned ? ISD::PARTIAL_REDUCE_SADD : ISD::PARTIAL_REDUCE_UADD;
-  auto NewMul = DAG.getNode(ISD::MUL, DL, A.getValueType(), A, B);
-  return DAG.getNode(NewOpcode, DL, AccVT, Acc, NewMul);
+      AIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
+  return DAG.getNode(NewOpcode, DL, AccVT, Acc, A, B);
 }
 
 SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
@@ -22094,9 +22093,10 @@ SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
     return SDValue();
 
   unsigned NewOpcode = InputOpcode == ISD::SIGN_EXTEND
-                           ? ISD::PARTIAL_REDUCE_SADD
-                           : ISD::PARTIAL_REDUCE_UADD;
-  return DAG.getNode(NewOpcode, DL, AccVT, Acc, Input);
+                           ? ISD::PARTIAL_REDUCE_SMLA
+                           : ISD::PARTIAL_REDUCE_UMLA;
+  return DAG.getNode(NewOpcode, DL, AccVT, Acc, Input,
+                     DAG.getConstant(1, DL, InputVT));
 }
 
 SDValue performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
@@ -26412,8 +26412,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::MSCATTER:
   case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
     return performMaskedGatherScatterCombine(N, DCI, DAG);
-  case ISD::PARTIAL_REDUCE_UADD:
-  case ISD::PARTIAL_REDUCE_SADD:
+  case ISD::PARTIAL_REDUCE_UMLA:
+  case ISD::PARTIAL_REDUCE_SMLA:
     return performPartialReduceAddCombine(N, DAG, Subtarget);
   case ISD::FP_EXTEND:
     return performFPExtendCombine(N, DAG, DCI, Subtarget);
@@ -29176,39 +29176,29 @@ SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
 }
 
 SDValue
-AArch64TargetLowering::LowerPARTIAL_REDUCE_ADD(SDValue Op,
+AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDLoc DL(Op);
   SDValue Acc = Op.getOperand(0);
-  SDValue Input = Op.getOperand(1);
+  SDValue Input1 = Op.getOperand(1);
+  SDValue Input2 = Op.getOperand(2);
 
   EVT AccVT = Acc.getValueType();
-  EVT InputVT = Input.getValueType();
+  EVT InputVT = Input1.getValueType();
 
   unsigned Opcode = Op.getOpcode();
 
-  // If the following condition is true and the input opcode was not ISD::MUL
-  // during the DAG-combine, it is already expanded. So this condition means the
-  // input opcode must have been ISD::MUL.
   if (AccVT.getVectorElementCount() * 4 == InputVT.getVectorElementCount()) {
-    unsigned IndexAdd = 0;
-    // ISD::MUL may have already been lowered, meaning the operands would be in
-    // different positions.
-    if (Input.getOpcode() != ISD::MUL)
-      IndexAdd = 1;
-    auto A = Input.getOperand(IndexAdd);
-    auto B = Input.getOperand(IndexAdd + 1);
-
-    unsigned DotOpcode = Opcode == ISD::PARTIAL_REDUCE_SADD ? AArch64ISD::SDOT
+    unsigned DotOpcode = Opcode == ISD::PARTIAL_REDUCE_SMLA ? AArch64ISD::SDOT
                                                             : AArch64ISD::UDOT;
-    return DAG.getNode(DotOpcode, DL, AccVT, Acc, A, B);
+    return DAG.getNode(DotOpcode, DL, AccVT, Acc, Input1, Input2);
   }
-  bool InputIsSigned = Opcode == ISD::PARTIAL_REDUCE_SADD;
+  bool InputIsSigned = Opcode == ISD::PARTIAL_REDUCE_SMLA;
   unsigned BottomOpcode =
       InputIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
   unsigned TopOpcode = InputIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
-  auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input);
-  return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, Input);
+  auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input1);
+  return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, Input1);
 }
 
 SDValue
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 88b05f8eeba284..d4ce065a82172d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1184,7 +1184,7 @@ class AArch64TargetLowering : public TargetLowering {
   SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerPARTIAL_REDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerPARTIAL_REDUCE_MLA(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;

>From 5bc3fccb96a1fee66b82bcdab3862f23b2653908 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Thu, 19 Dec 2024 09:35:48 +0000
Subject: [PATCH 10/13] MUL instructions now included in DAG combines.

---
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  8 ++-
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 16 +++--
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  8 ++-
 .../Target/AArch64/AArch64ISelLowering.cpp    | 67 +++++++++++--------
 4 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 349dadc8237b0a..19427b61efcd6a 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1606,7 +1606,13 @@ class SelectionDAG {
   /// \p Op1 Accumulator for where the result is stored for the partial
   /// reduction operation.
   /// \p Op2 Input for the partial reduction operation.
-  SDValue expandPartialReduceAdd(SDLoc DL, SDValue Op1, SDValue Op2);
+  /// Expands PARTIAL_REDUCE_S/UMLA nodes.
+  /// \p Acc Accumulator for where the result is stored for the partial
+  /// reduction operation.
+  /// \p Input1 First input for the partial reduction operation.
+  /// \p Input2 Second input for the partial reduction operation.
+  SDValue expandPartialReduceAdd(SDLoc DL, SDValue Acc, SDValue Input1,
+                                 SDValue Input2);
 
   /// Expands a node with multiple results to an FP or vector libcall. The
   /// libcall is expected to take all the operands of the \p Node followed by
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b720379497d911..2e82385d5a8aff 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2467,20 +2467,24 @@ SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
   return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
 }
 
-SDValue SelectionDAG::expandPartialReduceAdd(SDLoc DL, SDValue Op1,
-                                             SDValue Op2) {
-  EVT ReducedTy = Op1.getValueType();
-  EVT FullTy = Op2.getValueType();
+SDValue SelectionDAG::expandPartialReduceAdd(SDLoc DL, SDValue Acc,
+                                             SDValue Input1, SDValue Input2) {
+
+  EVT FullTy = Input1.getValueType();
+  Input2 = getAnyExtOrTrunc(Input2, DL, FullTy);
+  SDValue Input = getNode(ISD::MUL, DL, FullTy, Input1, Input2);
+
+  EVT ReducedTy = Acc.getValueType();
 
   unsigned Stride = ReducedTy.getVectorMinNumElements();
   unsigned ScaleFactor = FullTy.getVectorMinNumElements() / Stride;
 
   // Collect all of the subvectors
-  std::deque<SDValue> Subvectors = {Op1};
+  std::deque<SDValue> Subvectors = {Acc};
   for (unsigned I = 0; I < ScaleFactor; I++) {
     auto SourceIndex = getVectorIdxConstant(I * Stride, DL);
     Subvectors.push_back(
-        getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Op2, SourceIndex}));
+        getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {Input, SourceIndex}));
   }
 
   // Flatten the subvector tree
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 3ff3e3ce54d821..08dab4295a626b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8141,11 +8141,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Input = getValue(I.getOperand(1));
 
     if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
-      setValue(&I,
-               DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, dl, AccVT, Acc, Input));
+      setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, dl, AccVT, Acc, Input,
+                               DAG.getConstant(1, dl, Input.getValueType())));
       return;
     }
-    setValue(&I, DAG.expandPartialReduceAdd(dl, Acc, Input));
+    setValue(&I,
+             DAG.expandPartialReduceAdd(
+                 dl, Acc, Input, DAG.getConstant(1, dl, Input.getValueType())));
     return;
   }
   case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0f60a9c4280322..7445d2451406f5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21990,7 +21990,8 @@ static SDValue tryCombineWhileLo(SDNode *N,
   return SDValue(N, 0);
 }
 
-SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
+SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input1, SDValue &Input2,
+                               SelectionDAG &DAG,
                                const AArch64Subtarget *Subtarget, SDLoc &DL) {
   bool Scalable = Acc.getValueType().isScalableVector();
   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
@@ -21998,22 +21999,22 @@ SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
     return SDValue();
 
-  unsigned InputOpcode = Input->getOpcode();
+  unsigned Input1Opcode = Input1->getOpcode();
   EVT AccVT = Acc->getValueType(0);
   if (AccVT.getVectorElementCount() * 4 ==
-          Input->getValueType(0).getVectorElementCount() &&
-      InputOpcode != ISD::MUL)
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
-  if (InputOpcode != ISD::MUL)
+          Input1->getValueType(0).getVectorElementCount() &&
+      Input1Opcode != ISD::MUL)
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+  if (Input1Opcode != ISD::MUL)
     return SDValue();
 
-  auto A = Input->getOperand(0);
-  auto B = Input->getOperand(1);
+  auto A = Input1->getOperand(0);
+  auto B = Input1->getOperand(1);
   unsigned AOpcode = A->getOpcode();
   unsigned BOpcode = B->getOpcode();
 
   if (!ISD::isExtOpcode(AOpcode) || !ISD::isExtOpcode(BOpcode))
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
 
   bool AIsSigned = AOpcode == ISD::SIGN_EXTEND;
   bool BIsSigned = BOpcode == ISD::SIGN_EXTEND;
@@ -22022,6 +22023,10 @@ SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
   B = B->getOperand(0);
   EVT MulSrcVT = A.getValueType();
 
+  Input2 = DAG.getAnyExtOrTrunc(Input2, DL, MulSrcVT);
+  A = DAG.getNode(ISD::MUL, DL, MulSrcVT, A, Input2);
+  B = DAG.getNode(ISD::MUL, DL, MulSrcVT, B, Input2);
+
   // Dot products operate on chunks of four elements so there must be four times
   // as many elements in the wide type
   if (!(AccVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
@@ -22030,17 +22035,17 @@ SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
       !(AccVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
       !(AccVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
       !(AccVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
 
   unsigned DotOpcode = AIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
   if (AIsSigned != BIsSigned) {
     if (!Subtarget->hasMatMulInt8())
-      return DAG.expandPartialReduceAdd(DL, Acc, Input);
+      return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
 
     bool Scalable = AccVT.isScalableVT();
     // There's no nxv2i64 version of usdot
     if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
-      return DAG.expandPartialReduceAdd(DL, Acc, Input);
+      return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
 
     if (!BIsSigned)
       std::swap(A, B);
@@ -22067,32 +22072,37 @@ SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
   }
 
   if (A.getValueType() != B.getValueType())
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
 
   unsigned NewOpcode =
       AIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
   return DAG.getNode(NewOpcode, DL, AccVT, Acc, A, B);
 }
 
-SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input, SelectionDAG &DAG,
+SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input1, SDValue &Input2,
+                            SelectionDAG &DAG,
                             const AArch64Subtarget *Subtarget, SDLoc &DL) {
   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
-  unsigned InputOpcode = Input->getOpcode();
-  if (!ISD::isExtOpcode(InputOpcode))
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
-  Input = Input->getOperand(0);
-  EVT InputVT = Input.getValueType();
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+  unsigned Input1Opcode = Input1->getOpcode();
+  if (!ISD::isExtOpcode(Input1Opcode))
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+
   EVT AccVT = Acc->getValueType(0);
+  Input1 = Input1->getOperand(0);
+  EVT InputVT = Input1.getValueType();
+  Input2 = DAG.getAnyExtOrTrunc(Input2, DL, InputVT);
+  SDValue Input = DAG.getNode(ISD::MUL, DL, InputVT, Input1, Input2);
+
   if (!AccVT.isScalableVector())
-    return DAG.expandPartialReduceAdd(DL, Acc, Input);
+    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
 
   if (!(InputVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
       !(InputVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
       !(InputVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
     return SDValue();
 
-  unsigned NewOpcode = InputOpcode == ISD::SIGN_EXTEND
+  unsigned NewOpcode = Input1Opcode == ISD::SIGN_EXTEND
                            ? ISD::PARTIAL_REDUCE_SMLA
                            : ISD::PARTIAL_REDUCE_UMLA;
   return DAG.getNode(NewOpcode, DL, AccVT, Acc, Input,
@@ -22103,18 +22113,21 @@ SDValue performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
                                        const AArch64Subtarget *Subtarget) {
   SDLoc DL(N);
   auto Acc = N->getOperand(0);
-  auto Input = N->getOperand(1);
+  auto Input1 = N->getOperand(1);
+  auto Input2 = N->getOperand(2);
   EVT AccElemVT = Acc.getValueType().getVectorElementType();
-  EVT InputElemVT = Input.getValueType().getVectorElementType();
+  EVT InputElemVT = Input1.getValueType().getVectorElementType();
 
   // If the exts have already been removed or it has already been lowered to an
   // usdot instruction, then the element types will not be equal
-  if (InputElemVT != AccElemVT || Input.getOpcode() == AArch64ISD::USDOT)
+  if (InputElemVT != AccElemVT || Input1.getOpcode() == AArch64ISD::USDOT)
     return SDValue(N, 0);
 
-  if (auto Dot = tryCombineToDotProduct(Acc, Input, DAG, Subtarget, DL))
+  if (auto Dot =
+          tryCombineToDotProduct(Acc, Input1, Input2, DAG, Subtarget, DL))
     return Dot;
-  if (auto WideAdd = tryCombineToWideAdd(Acc, Input, DAG, Subtarget, DL))
+  if (auto WideAdd =
+          tryCombineToWideAdd(Acc, Input1, Input2, DAG, Subtarget, DL))
     return WideAdd;
   return SDValue();
 }

>From 405e7fc46a6ffd789dbbd19cea8d2581340d94a0 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Fri, 10 Jan 2025 09:27:11 +0000
Subject: [PATCH 11/13] Make the no bin op changes work with adding Partial
 Reduction SDNodes.

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 186 +++++++++---------
 1 file changed, 96 insertions(+), 90 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7445d2451406f5..43f4b8c6595731 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21990,144 +21990,150 @@ static SDValue tryCombineWhileLo(SDNode *N,
   return SDValue(N, 0);
 }
 
-SDValue tryCombineToDotProduct(SDValue &Acc, SDValue &Input1, SDValue &Input2,
+SDValue tryCombineToDotProduct(SDValue &Op0, SDValue &Op1, SDValue &Op2,
                                SelectionDAG &DAG,
                                const AArch64Subtarget *Subtarget, SDLoc &DL) {
-  bool Scalable = Acc.getValueType().isScalableVector();
+  bool Scalable = Op0->getValueType(0).isScalableVector();
   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
-    return SDValue();
+    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
+    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+
+  unsigned Op1Opcode = Op1->getOpcode();
+  SDValue MulOpLHS, MulOpRHS;
+  bool MulOpLHSIsSigned, MulOpRHSIsSigned;
+  if (ISD::isExtOpcode(Op1Opcode)) {
+    MulOpLHSIsSigned = MulOpRHSIsSigned = (Op1Opcode == ISD::SIGN_EXTEND);
+    MulOpLHS = Op1->getOperand(0);
+    MulOpRHS = DAG.getAnyExtOrTrunc(Op2, DL, MulOpLHS.getValueType());
+  } else if (Op1Opcode == ISD::MUL) {
+    SDValue ExtMulOpLHS = Op1->getOperand(0);
+    SDValue ExtMulOpRHS = Op1->getOperand(1);
+
+    unsigned ExtMulOpLHSOpcode = ExtMulOpLHS->getOpcode();
+    unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
+    if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
+        !ISD::isExtOpcode(ExtMulOpRHSOpcode))
+      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+
+    MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
+    MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
+
+    MulOpLHS = ExtMulOpLHS->getOperand(0);
+    MulOpRHS = ExtMulOpRHS->getOperand(0);
+    EVT MulOpLHSVT = MulOpLHS.getValueType();
+
+    if (MulOpLHSVT != MulOpRHS.getValueType())
+      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+
+    Op2 = DAG.getAnyExtOrTrunc(Op2, DL, MulOpLHSVT);
+    MulOpLHS = DAG.getNode(ISD::MUL, DL, MulOpLHSVT, MulOpLHS, Op2);
+    MulOpRHS = DAG.getNode(ISD::MUL, DL, MulOpLHSVT, MulOpRHS, Op2);
+  } else
     return SDValue();
 
-  unsigned Input1Opcode = Input1->getOpcode();
-  EVT AccVT = Acc->getValueType(0);
-  if (AccVT.getVectorElementCount() * 4 ==
-          Input1->getValueType(0).getVectorElementCount() &&
-      Input1Opcode != ISD::MUL)
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
-  if (Input1Opcode != ISD::MUL)
-    return SDValue();
-
-  auto A = Input1->getOperand(0);
-  auto B = Input1->getOperand(1);
-  unsigned AOpcode = A->getOpcode();
-  unsigned BOpcode = B->getOpcode();
-
-  if (!ISD::isExtOpcode(AOpcode) || !ISD::isExtOpcode(BOpcode))
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
-
-  bool AIsSigned = AOpcode == ISD::SIGN_EXTEND;
-  bool BIsSigned = BOpcode == ISD::SIGN_EXTEND;
-
-  A = A->getOperand(0);
-  B = B->getOperand(0);
-  EVT MulSrcVT = A.getValueType();
-
-  Input2 = DAG.getAnyExtOrTrunc(Input2, DL, MulSrcVT);
-  A = DAG.getNode(ISD::MUL, DL, MulSrcVT, A, Input2);
-  B = DAG.getNode(ISD::MUL, DL, MulSrcVT, B, Input2);
+  SDValue Acc = Op0;
+  EVT ReducedVT = Acc->getValueType(0);
+  EVT MulSrcVT = MulOpLHS.getValueType();
 
   // Dot products operate on chunks of four elements so there must be four times
   // as many elements in the wide type
-  if (!(AccVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
-      !(AccVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
-      !(AccVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
-      !(AccVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
-      !(AccVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
-      !(AccVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
-
-  unsigned DotOpcode = AIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
-  if (AIsSigned != BIsSigned) {
+  if (!(ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) &&
+      !(ReducedVT == MVT::nxv4i32 && MulSrcVT == MVT::nxv16i8) &&
+      !(ReducedVT == MVT::nxv2i64 && MulSrcVT == MVT::nxv8i16) &&
+      !(ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8) &&
+      !(ReducedVT == MVT::v4i32 && MulSrcVT == MVT::v16i8) &&
+      !(ReducedVT == MVT::v2i32 && MulSrcVT == MVT::v8i8))
+    return SDValue();
+
+  // If the extensions are mixed, we should lower it to a usdot instead
+  unsigned DotOpcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
+  if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
     if (!Subtarget->hasMatMulInt8())
-      return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
 
-    bool Scalable = AccVT.isScalableVT();
+    bool Scalable = ReducedVT.isScalableVT();
     // There's no nxv2i64 version of usdot
-    if (Scalable && AccVT != MVT::nxv4i32 && AccVT != MVT::nxv4i64)
-      return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+    if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
+      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
 
-    if (!BIsSigned)
-      std::swap(A, B);
+    if (!MulOpRHSIsSigned)
+      std::swap(MulOpLHS, MulOpRHS);
     DotOpcode = AArch64ISD::USDOT;
     // Lower usdot patterns here because legalisation would attempt to split it
     // unless exts are removed. But, removing the exts would lose the
     // information about whether each operand is signed.
-    if ((AccVT != MVT::nxv4i64 || MulSrcVT != MVT::nxv16i8) &&
-        (AccVT != MVT::v4i64 || MulSrcVT != MVT::v16i8))
-      return DAG.getNode(DotOpcode, DL, AccVT, Acc, A, B);
+    if ((ReducedVT != MVT::nxv4i64 || MulSrcVT != MVT::nxv16i8) &&
+        (ReducedVT != MVT::v4i64 || MulSrcVT != MVT::v16i8))
+      return DAG.getNode(DotOpcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
   }
 
   // Partial reduction lowering for (nx)v16i8 to (nx)v4i64 requires an i32 dot
   // product followed by a zero / sign extension. Need to lower this here
   // because legalisation would attempt to split it.
-  if ((AccVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
-      (AccVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
-    EVT AccVTI32 = (AccVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
+  if ((ReducedVT == MVT::nxv4i64 && MulSrcVT == MVT::nxv16i8) ||
+      (ReducedVT == MVT::v4i64 && MulSrcVT == MVT::v16i8)) {
+    EVT ReducedVTI32 =
+        (ReducedVT.isScalableVector()) ? MVT::nxv4i32 : MVT::v4i32;
 
-    auto DotI32 = DAG.getNode(DotOpcode, DL, AccVTI32,
-                              DAG.getConstant(0, DL, AccVTI32), A, B);
-    auto Extended = DAG.getSExtOrTrunc(DotI32, DL, AccVT);
-    return DAG.getNode(ISD::ADD, DL, AccVT, Acc, Extended);
+    SDValue DotI32 =
+        DAG.getNode(DotOpcode, DL, ReducedVTI32,
+                    DAG.getConstant(0, DL, ReducedVTI32), MulOpLHS, MulOpRHS);
+    SDValue Extended = DAG.getSExtOrTrunc(DotI32, DL, ReducedVT);
+    return DAG.getNode(ISD::ADD, DL, ReducedVT, Acc, Extended);
   }
 
-  if (A.getValueType() != B.getValueType())
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
-
   unsigned NewOpcode =
-      AIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
-  return DAG.getNode(NewOpcode, DL, AccVT, Acc, A, B);
+      MulOpLHSIsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
+  return DAG.getNode(NewOpcode, DL, ReducedVT, Acc, MulOpLHS, MulOpRHS);
 }
 
-SDValue tryCombineToWideAdd(SDValue &Acc, SDValue &Input1, SDValue &Input2,
+SDValue tryCombineToWideAdd(SDValue &Op0, SDValue &Op1, SDValue &Op2,
                             SelectionDAG &DAG,
                             const AArch64Subtarget *Subtarget, SDLoc &DL) {
   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
-  unsigned Input1Opcode = Input1->getOpcode();
-  if (!ISD::isExtOpcode(Input1Opcode))
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+  unsigned Op1Opcode = Op1->getOpcode();
+  if (!ISD::isExtOpcode(Op1Opcode))
+    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
 
-  EVT AccVT = Acc->getValueType(0);
-  Input1 = Input1->getOperand(0);
-  EVT InputVT = Input1.getValueType();
-  Input2 = DAG.getAnyExtOrTrunc(Input2, DL, InputVT);
-  SDValue Input = DAG.getNode(ISD::MUL, DL, InputVT, Input1, Input2);
+  EVT AccVT = Op0->getValueType(0);
+  Op1 = Op1->getOperand(0);
+  EVT Op1VT = Op1.getValueType();
+  Op2 = DAG.getAnyExtOrTrunc(Op2, DL, Op1VT);
+  SDValue Input = DAG.getNode(ISD::MUL, DL, Op1VT, Op1, Op2);
 
   if (!AccVT.isScalableVector())
-    return DAG.expandPartialReduceAdd(DL, Acc, Input1, Input2);
+    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
 
-  if (!(InputVT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
-      !(InputVT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
-      !(InputVT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
+  if (!(Op1VT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
+      !(Op1VT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
+      !(Op1VT == MVT::nxv16i8 && AccVT == MVT::nxv8i16))
     return SDValue();
 
-  unsigned NewOpcode = Input1Opcode == ISD::SIGN_EXTEND
-                           ? ISD::PARTIAL_REDUCE_SMLA
-                           : ISD::PARTIAL_REDUCE_UMLA;
-  return DAG.getNode(NewOpcode, DL, AccVT, Acc, Input,
-                     DAG.getConstant(1, DL, InputVT));
+  unsigned NewOpcode = Op1Opcode == ISD::SIGN_EXTEND ? ISD::PARTIAL_REDUCE_SMLA
+                                                     : ISD::PARTIAL_REDUCE_UMLA;
+  return DAG.getNode(NewOpcode, DL, AccVT, Op0, Input,
+                     DAG.getConstant(1, DL, Op1VT));
 }
 
 SDValue performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
                                        const AArch64Subtarget *Subtarget) {
   SDLoc DL(N);
-  auto Acc = N->getOperand(0);
-  auto Input1 = N->getOperand(1);
-  auto Input2 = N->getOperand(2);
-  EVT AccElemVT = Acc.getValueType().getVectorElementType();
-  EVT InputElemVT = Input1.getValueType().getVectorElementType();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue Op2 = N->getOperand(2);
+  EVT Op0ElemVT = Op0.getValueType().getVectorElementType();
+  EVT Op1ElemVT = Op1.getValueType().getVectorElementType();
 
   // If the exts have already been removed or it has already been lowered to an
   // usdot instruction, then the element types will not be equal
-  if (InputElemVT != AccElemVT || Input1.getOpcode() == AArch64ISD::USDOT)
+  if (Op0ElemVT != Op1ElemVT || Op1.getOpcode() == AArch64ISD::USDOT)
     return SDValue(N, 0);
 
-  if (auto Dot =
-          tryCombineToDotProduct(Acc, Input1, Input2, DAG, Subtarget, DL))
+  if (auto Dot = tryCombineToDotProduct(Op0, Op1, Op2, DAG, Subtarget, DL))
     return Dot;
-  if (auto WideAdd =
-          tryCombineToWideAdd(Acc, Input1, Input2, DAG, Subtarget, DL))
+  if (auto WideAdd = tryCombineToWideAdd(Op0, Op1, Op2, DAG, Subtarget, DL))
     return WideAdd;
   return SDValue();
 }

>From 86ec3b3bfa9a93065a61b339d50f584d05a67af2 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Mon, 20 Jan 2025 09:59:44 +0000
Subject: [PATCH 12/13] Address comments on patch. Remove
 shouldExpandPartialReductionIntrinsic().

---
 llvm/include/llvm/CodeGen/ISDOpcodes.h        | 15 +++++--
 llvm/include/llvm/CodeGen/SelectionDAG.h      |  4 --
 llvm/include/llvm/CodeGen/TargetLowering.h    |  7 ---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 25 +++++++----
 .../Target/AArch64/AArch64ISelLowering.cpp    | 45 ++++++-------------
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 --
 6 files changed, 41 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 2436d914c80cba..53b9e3efdaf07b 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1451,9 +1451,18 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
-  // Nodes used to represent a partial reduction addition operation (signed and
-  // unsigned).
-  // Operands: Accumulator, Input
+  // Partial Reduction nodes. These represent multiply-add instructions because
+  // Input1 and Input2 are multiplied together first. This result is then
+  // reduced, by addition, to the number of elements that the Accumulator's type
+  // has.
+  // Input1 and Input2 must be the same type. Accumulator's element type must
+  // match that of Input1 and Input2. The number of elements in Input1 and
+  // Input2 must be a positive integer multiple of the number of elements in the
+  // Accumulator.
+  // The signedness of this node will dictate the signedness of nodes expanded
+  // from it. The signedness of the node is dictated by the signedness of
+  // Input1.
+  // Operands: Accumulator, Input1, Input2
   // Outputs: Output
   PARTIAL_REDUCE_SMLA,
   PARTIAL_REDUCE_UMLA,
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 19427b61efcd6a..7c1a22d2593439 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1602,10 +1602,6 @@ class SelectionDAG {
   /// the target's desired shift amount type.
   SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op);
 
-  /// Expands PARTIAL_REDUCE_S/UMLA nodes.
-  /// \p Op1 Accumulator for where the result is stored for the partial
-  /// reduction operation.
-  /// \p Op2 Input for the partial reduction operation.
   /// Expands PARTIAL_REDUCE_S/UMLA nodes.
   /// \p Acc Accumulator for where the result is stored for the partial
   /// reduction operation.
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ce58777655e063..bb6dcf7543fe3a 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -455,13 +455,6 @@ class TargetLoweringBase {
     return true;
   }
 
-  /// Return true if the @llvm.experimental.vector.partial.reduce.* intrinsic
-  /// should be expanded using generic code in SelectionDAGBuilder.
-  virtual bool
-  shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const {
-    return true;
-  }
-
   /// Return true if the @llvm.get.active.lane.mask intrinsic should be expanded
   /// using generic code in SelectionDAGBuilder.
   virtual bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 08dab4295a626b..94a351676c1e36 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8139,15 +8139,22 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     SDValue Acc = getValue(I.getOperand(0));
     EVT AccVT = Acc.getValueType();
     SDValue Input = getValue(I.getOperand(1));
-
-    if (!TLI.shouldExpandPartialReductionIntrinsic(cast<IntrinsicInst>(&I))) {
-      setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, dl, AccVT, Acc, Input,
-                               DAG.getConstant(1, dl, Input.getValueType())));
-      return;
-    }
-    setValue(&I,
-             DAG.expandPartialReduceAdd(
-                 dl, Acc, Input, DAG.getConstant(1, dl, Input.getValueType())));
+    EVT InputVT = Input.getValueType();
+
+    assert(AccVT.getVectorElementType() == InputVT.getVectorElementType() &&
+           "Expected operands to have the same vector element type!");
+    assert(InputVT.getVectorElementCount().getKnownMinValue() %
+                   AccVT.getVectorElementCount().getKnownMinValue() ==
+               0 &&
+           "Expected the element count of the Input operand to be a positive "
+           "integer multiple of the element count of the Accumulator operand!");
+
+    // ISD::PARTIAL_REDUCE_UMLA is chosen arbitrarily and would function the
+    // same if ISD::PARTIAL_REDUCE_SMLA was used instead. It should be changed
+    // to its correct signedness when combining or expanding, according to
+    // extends being performed on Input.
+    setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_UMLA, dl, AccVT, Acc, Input,
+                             DAG.getConstant(1, dl, InputVT)));
     return;
   }
   case Intrinsic::experimental_cttz_elts: {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 64e4675b5e535c..266c810611485a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2051,28 +2051,6 @@ bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
   return false;
 }
 
-bool AArch64TargetLowering::shouldExpandPartialReductionIntrinsic(
-    const IntrinsicInst *I) const {
-  if (I->getIntrinsicID() != Intrinsic::experimental_vector_partial_reduce_add)
-    return true;
-
-  EVT VT = EVT::getEVT(I->getType());
-  auto Input = I->getOperand(1);
-  EVT InputVT = EVT::getEVT(Input->getType());
-
-  if ((InputVT == MVT::nxv4i64 && VT == MVT::nxv2i64) ||
-      (InputVT == MVT::nxv8i32 && VT == MVT::nxv4i32) ||
-      (InputVT == MVT::nxv16i16 && VT == MVT::nxv8i16) ||
-      (InputVT == MVT::nxv16i64 && VT == MVT::nxv4i64) ||
-      (InputVT == MVT::nxv16i32 && VT == MVT::nxv4i32) ||
-      (InputVT == MVT::nxv8i64 && VT == MVT::nxv2i64) ||
-      (InputVT == MVT::v16i64 && VT == MVT::v4i64) ||
-      (InputVT == MVT::v16i32 && VT == MVT::v4i32) ||
-      (InputVT == MVT::v8i32 && VT == MVT::v2i32))
-    return false;
-  return true;
-}
-
 bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
   if (!Subtarget->isSVEorStreamingSVEAvailable())
     return true;
@@ -22059,9 +22037,9 @@ SDValue tryCombineToDotProduct(SDValue &Op0, SDValue &Op1, SDValue &Op2,
                                const AArch64Subtarget *Subtarget, SDLoc &DL) {
   bool Scalable = Op0->getValueType(0).isScalableVector();
   if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
-    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+    return SDValue();
   if (!Scalable && (!Subtarget->isNeonAvailable() || !Subtarget->hasDotProd()))
-    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+    return SDValue();
 
   unsigned Op1Opcode = Op1->getOpcode();
   SDValue MulOpLHS, MulOpRHS;
@@ -22078,7 +22056,7 @@ SDValue tryCombineToDotProduct(SDValue &Op0, SDValue &Op1, SDValue &Op2,
     unsigned ExtMulOpRHSOpcode = ExtMulOpRHS->getOpcode();
     if (!ISD::isExtOpcode(ExtMulOpLHSOpcode) ||
         !ISD::isExtOpcode(ExtMulOpRHSOpcode))
-      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+      return SDValue();
 
     MulOpLHSIsSigned = ExtMulOpLHSOpcode == ISD::SIGN_EXTEND;
     MulOpRHSIsSigned = ExtMulOpRHSOpcode == ISD::SIGN_EXTEND;
@@ -22088,7 +22066,7 @@ SDValue tryCombineToDotProduct(SDValue &Op0, SDValue &Op1, SDValue &Op2,
     EVT MulOpLHSVT = MulOpLHS.getValueType();
 
     if (MulOpLHSVT != MulOpRHS.getValueType())
-      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+      return SDValue();
 
     Op2 = DAG.getAnyExtOrTrunc(Op2, DL, MulOpLHSVT);
     MulOpLHS = DAG.getNode(ISD::MUL, DL, MulOpLHSVT, MulOpLHS, Op2);
@@ -22114,12 +22092,12 @@ SDValue tryCombineToDotProduct(SDValue &Op0, SDValue &Op1, SDValue &Op2,
   unsigned DotOpcode = MulOpLHSIsSigned ? AArch64ISD::SDOT : AArch64ISD::UDOT;
   if (MulOpLHSIsSigned != MulOpRHSIsSigned) {
     if (!Subtarget->hasMatMulInt8())
-      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+      return SDValue();
 
     bool Scalable = ReducedVT.isScalableVT();
     // There's no nxv2i64 version of usdot
     if (Scalable && ReducedVT != MVT::nxv4i32 && ReducedVT != MVT::nxv4i64)
-      return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+      return SDValue();
 
     if (!MulOpRHSIsSigned)
       std::swap(MulOpLHS, MulOpRHS);
@@ -22156,10 +22134,10 @@ SDValue tryCombineToWideAdd(SDValue &Op0, SDValue &Op1, SDValue &Op2,
                             SelectionDAG &DAG,
                             const AArch64Subtarget *Subtarget, SDLoc &DL) {
   if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
-    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+    return SDValue();
   unsigned Op1Opcode = Op1->getOpcode();
   if (!ISD::isExtOpcode(Op1Opcode))
-    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+    return SDValue();
 
   EVT AccVT = Op0->getValueType(0);
   Op1 = Op1->getOperand(0);
@@ -22168,7 +22146,7 @@ SDValue tryCombineToWideAdd(SDValue &Op0, SDValue &Op1, SDValue &Op2,
   SDValue Input = DAG.getNode(ISD::MUL, DL, Op1VT, Op1, Op2);
 
   if (!AccVT.isScalableVector())
-    return DAG.expandPartialReduceAdd(DL, Op0, Op1, Op2);
+    return SDValue();
 
   if (!(Op1VT == MVT::nxv4i32 && AccVT == MVT::nxv2i64) &&
       !(Op1VT == MVT::nxv8i16 && AccVT == MVT::nxv4i32) &&
@@ -22199,7 +22177,10 @@ SDValue performPartialReduceAddCombine(SDNode *N, SelectionDAG &DAG,
     return Dot;
   if (auto WideAdd = tryCombineToWideAdd(Op0, Op1, Op2, DAG, Subtarget, DL))
     return WideAdd;
-  return SDValue();
+  // N->getOperand needs calling again because the Op variables may have been
+  // changed by the functions above
+  return DAG.expandPartialReduceAdd(DL, N->getOperand(0), N->getOperand(1),
+                                    N->getOperand(2));
 }
 
 static SDValue performIntrinsicCombine(SDNode *N,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d4ce065a82172d..c2f0b7863d308f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -993,9 +993,6 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override;
 
-  bool
-  shouldExpandPartialReductionIntrinsic(const IntrinsicInst *I) const override;
-
   bool shouldExpandCttzElements(EVT VT) const override;
 
   bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override;

>From ed4f2e5d8c6d3af00f9d9115c0aacf8622580ca8 Mon Sep 17 00:00:00 2001
From: James Chesterman <james.chesterman at arm.com>
Date: Mon, 20 Jan 2025 15:10:52 +0000
Subject: [PATCH 13/13] Add the MUL in LowerPARTIAL_REDUCE_MLA()

Only create the MUL there if Input2 is not a splat vector of constant
1s, since multiplying by 1 is a no-op. Still create the MUL in the DAG
combine for the wide add pattern, because there it is pruned if an
operand is a splat of constant 1s, or changed to a shift instruction if
an operand is a power of 2. This would not happen if the MUL was made
in LowerPARTIAL_REDUCE_MLA.
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 266c810611485a..2a9bb4a3c58f97 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22142,7 +22142,10 @@ SDValue tryCombineToWideAdd(SDValue &Op0, SDValue &Op1, SDValue &Op2,
   EVT AccVT = Op0->getValueType(0);
   Op1 = Op1->getOperand(0);
   EVT Op1VT = Op1.getValueType();
+  // Makes Op2's value type match the value type of Op1 without its extend.
   Op2 = DAG.getAnyExtOrTrunc(Op2, DL, Op1VT);
+  // Make a MUL between Op1 and Op2 here so the MUL can be changed if possible
+  // (can be pruned or changed to a shift instruction for example).
   SDValue Input = DAG.getNode(ISD::MUL, DL, Op1VT, Op1, Op2);
 
   if (!AccVT.isScalableVector())
@@ -22155,6 +22158,7 @@ SDValue tryCombineToWideAdd(SDValue &Op0, SDValue &Op1, SDValue &Op2,
 
   unsigned NewOpcode = Op1Opcode == ISD::SIGN_EXTEND ? ISD::PARTIAL_REDUCE_SMLA
                                                      : ISD::PARTIAL_REDUCE_UMLA;
+  // Return a constant of 1s for Op2 so the MUL is not performed again.
   return DAG.getNode(NewOpcode, DL, AccVT, Op0, Input,
                      DAG.getConstant(1, DL, Op1VT));
 }
@@ -29305,11 +29309,19 @@ AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
                                                             : AArch64ISD::UDOT;
     return DAG.getNode(DotOpcode, DL, AccVT, Acc, Input1, Input2);
   }
+
+  SDValue MulInput = Input1;
+  // If Input2 is a splat vector of constant 1 then the MUL instruction is not
+  // needed. If it was created here it would not be automatically pruned.
+  if (Input2.getOpcode() != ISD::SPLAT_VECTOR || Input2.getNumOperands() == 0 ||
+      !isOneConstant(Input2.getOperand(0)))
+    MulInput = DAG.getNode(ISD::MUL, DL, InputVT, Input1, Input2);
+
   bool InputIsSigned = Opcode == ISD::PARTIAL_REDUCE_SMLA;
   unsigned BottomOpcode =
       InputIsSigned ? AArch64ISD::SADDWB : AArch64ISD::UADDWB;
   unsigned TopOpcode = InputIsSigned ? AArch64ISD::SADDWT : AArch64ISD::UADDWT;
-  auto BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input1);
+  SDValue BottomNode = DAG.getNode(BottomOpcode, DL, AccVT, Acc, Input1);
   return DAG.getNode(TopOpcode, DL, AccVT, BottomNode, Input1);
 }