[llvm] [AArch64][SVE] Add codegen support for partial reduction lowering to wide add instructions (PR #114406)

Thu Oct 31 07:45:33 PDT 2024

================
@@ -21783,6 +21784,62 @@ SDValue tryLowerPartialReductionToDot(SDNode *N,
   return DAG.getNode(Opcode, DL, ReducedType, NarrowOp, A, B);
 }
 
+SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
+                                          const AArch64Subtarget *Subtarget,
+                                          SelectionDAG &DAG) {
+
+  assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+         getIntrinsicID(N) ==
+             Intrinsic::experimental_vector_partial_reduce_add &&
+         "Expected a partial reduction node");
+
+  bool Scalable = N->getValueType(0).isScalableVector();
+  if (Scalable && !Subtarget->isSVEorStreamingSVEAvailable())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  auto Accumulator = N->getOperand(1);
+  auto ExtInput = N->getOperand(2);
+
+  EVT AccumulatorType = Accumulator.getValueType();
+  EVT AccumulatorElementType = AccumulatorType.getVectorElementType();
+
+  if (ExtInput.getValueType().getVectorElementType() != AccumulatorElementType)
+    return SDValue();
+
+  unsigned ExtInputOpcode = ExtInput->getOpcode();
+  if (!ISD::isExtOpcode(ExtInputOpcode))
+    return SDValue();
+
+  auto Input = ExtInput->getOperand(0);
+  EVT InputType = Input.getValueType();
+
+  // To do this transformation, output element size needs to be double input
+  // element size, and output number of elements needs to be half the input
+  // number of elements
+  if (!(InputType.getVectorElementType().getSizeInBits() * 2 ==
+        AccumulatorElementType.getSizeInBits()) ||
+      !(AccumulatorType.getVectorElementCount() * 2 ==
+        InputType.getVectorElementCount()) ||
+      !(AccumulatorType.isScalableVector() == InputType.isScalableVector()))
+    return SDValue();
+
+  bool InputIsSigned = ExtInputOpcode == ISD::SIGN_EXTEND;
+  auto BottomIntrinsic = InputIsSigned ? Intrinsic::aarch64_sve_saddwb
----------------
huntergr-arm wrote:

I think adding dedicated AArch64ISD nodes for these operations would be good, though that could also be done in a separate patch.

https://github.com/llvm/llvm-project/pull/114406