[llvm] [RISCV] Add DAG combine to convert (iN reduce.add (zext (vXi1 A to vXiN)) into vcpop.m (PR #127497)

Wed Feb 19 06:16:11 PST 2025

================
@@ -18100,25 +18100,38 @@ static SDValue combineTruncToVnclip(SDNode *N, SelectionDAG &DAG,
 //   (iX ctpop (bitcast (vXi1 A)))
 // ->
 //   (zext (vcpop.m (nxvYi1 (insert_subvec (vXi1 A)))))
+// and
+//   (iN reduce.add (zext (vXi1 A) to vXiN))
+// ->
+//   (zext (vcpop.m (nxvYi1 (insert_subvec (vXi1 A)))))
 // FIXME: It's complicated to match all the variations of this after type
 // legalization so we only handle the pre-type legalization pattern, but that
 // requires the fixed vector type to be legal.
-static SDValue combineScalarCTPOPToVCPOP(SDNode *N, SelectionDAG &DAG,
-                                         const RISCVSubtarget &Subtarget) {
+static SDValue combineToVCPOP(SDNode *N, SelectionDAG &DAG,
+                              const RISCVSubtarget &Subtarget) {
+  unsigned Opc = N->getOpcode();
+  assert((Opc == ISD::CTPOP || Opc == ISD::VECREDUCE_ADD) &&
+         "Unexpected opcode");
   EVT VT = N->getValueType(0);
   if (!VT.isScalarInteger())
     return SDValue();
 
   SDValue Src = N->getOperand(0);
 
-  // Peek through zero_extend. It doesn't change the count.
-  if (Src.getOpcode() == ISD::ZERO_EXTEND)
-    Src = Src.getOperand(0);
+  if (Opc == ISD::CTPOP) {
+    // Peek through zero_extend. It doesn't change the count.
+    if (Src.getOpcode() == ISD::ZERO_EXTEND)
+      Src = Src.getOperand(0);
 
-  if (Src.getOpcode() != ISD::BITCAST)
-    return SDValue();
+    if (Src.getOpcode() != ISD::BITCAST)
+      return SDValue();
+    Src = Src.getOperand(0);
+  } else if (Opc == ISD::VECREDUCE_ADD) {
+    if (Src.getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
----------------
skachkov-sc wrote:

Good catch, thank you! I've made a fix that estimates the maximum possible number of elements in the mask and ensures that the destination type is large enough to hold it. However, I've discovered that for this particular case (zext <16 x i1> to <16 x i4> + reduce.add) vcpop.m is still generated, because the type was extended to i8 during type legalization. This promotion can also be observed here: https://godbolt.org/z/avWEdcjvW. It is confusing: if we have an all-ones mask as input, the result should be zero due to the wrapping behaviour of the add reduction (16 mod 2^4 = 0), but the generated code will return 16 in this case. Also, I can't find any documentation on the overflow behavior of integer reductions (intrinsics or ISD nodes)...

https://github.com/llvm/llvm-project/pull/127497