[llvm] [NVPTX] fold movs into loads and stores (PR #144581)

Tue Jun 17 13:37:32 PDT 2025

================
@@ -5047,26 +5043,229 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
-                                         std::size_t Back) {
+/// Combine extractelts into a load by increasing the number of return values.
+///
+/// If \p N produces values of a type accepted by Isv2x16VT and every use of
+/// such a value is an EXTRACT_VECTOR_ELT, the node is replaced by the next
+/// wider load opcode, which returns each packed vector as two scalar values.
+/// The original results are then rebuilt from the new node with BUILD_VECTORs
+/// and a MERGE_VALUES node so that the set of outputs stays the same.
+///
+/// \param N   the load-like node to widen; it is cast to MemSDNode.
+/// \param DCI combiner state, used for the legalization phase and the DAG.
+/// \returns the MERGE_VALUES replacement, or an empty SDValue when the combine
+/// does not apply.
+static SDValue
+combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // Don't run this optimization before the legalizer
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  EVT ElemVT = N->getValueType(0);
+  if (!Isv2x16VT(ElemVT))
+    return SDValue();
+
+  // Check whether all outputs are either used by an extractelt or are
+  // glue/chain nodes
+  if (!all_of(N->uses(), [&](SDUse &U) {
+        return U.getValueType() != ElemVT ||
+               (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+                // also check that the extractelt is used if this is an
+                // ISD::LOAD, otherwise it may be optimized by something else
+                (N->getOpcode() != ISD::LOAD || !U.getUser()->use_empty()));
+      }))
+    return SDValue();
+
+  auto *LD = cast<MemSDNode>(N);
+  EVT MemVT = LD->getMemoryVT();
+  SDLoc DL(LD);
+
+  // The new opcode after we double the number of outputs. The operands of the
+  // original node are carried over unchanged.
+  NVPTXISD::NodeType Opcode;
+  SmallVector<SDValue> Operands(LD->ops());
+  switch (LD->getOpcode()) {
+  default:
+    // Not an opcode this combine knows how to widen. Bail out instead of
+    // falling through with Opcode left uninitialized.
+    return SDValue();
+  // Any packed type is legal, so the legalizer will not have lowered ISD::LOAD
+  // -> NVPTXISD::Load. We have to do it here.
+  case ISD::LOAD: {
+    // Check the alignment first so that no extra node is created when we end
+    // up abandoning the combine.
+    Align Alignment = LD->getAlign();
+    const auto &TD = DCI.DAG.getDataLayout();
+    Align PrefAlign =
+        TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DCI.DAG.getContext()));
+    if (Alignment < PrefAlign) {
+      // This load is not sufficiently aligned, so bail out and let this
+      // vector load be scalarized.  Note that we may still be able to emit
+      // smaller vector loads.  For example, if we are loading a <4 x float>
+      // with an alignment of 8, this check will fail but the legalizer will
+      // try again with 2 x <2 x float>, which will succeed with an alignment
+      // of 8.
+      return SDValue();
+    }
+    Opcode = NVPTXISD::LoadV2;
+    // The new node carries the load's extension type as an additional
+    // trailing operand.
+    Operands.push_back(DCI.DAG.getIntPtrConstant(
+        cast<LoadSDNode>(LD)->getExtensionType(), DL));
+    break;
+  }
+  case NVPTXISD::LoadParamV2:
+    Opcode = NVPTXISD::LoadParamV4;
+    break;
+  case NVPTXISD::LoadV2:
+    Opcode = NVPTXISD::LoadV4;
+    break;
+  case NVPTXISD::LoadV4:
+    // PTX doesn't support v8 for 16-bit values
+  case NVPTXISD::LoadV8:
+    // PTX doesn't support the next doubling of outputs
+    return SDValue();
+  }
+
+  // Build the result type list of the new node: every packed vector result is
+  // replaced by two scalar results; all other results (e.g. chain/glue) are
+  // kept as they are.
+  SmallVector<EVT> NewVTs;
+  for (EVT VT : LD->values()) {
+    if (VT == ElemVT) {
+      const EVT ScalarVT = ElemVT.getVectorElementType();
+      NewVTs.insert(NewVTs.end(), {ScalarVT, ScalarVT});
+    } else
+      NewVTs.push_back(VT);
+  }
+
+  // Create the new load
+  SDValue NewLoad =
+      DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs),
+                                  Operands, MemVT, LD->getMemOperand());
+
+  // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
+  // the outputs the same. These nodes will be optimized away in later
+  // DAGCombiner iterations.
+  SmallVector<SDValue> Results;
+  for (unsigned I = 0; I < NewLoad->getNumValues();) {
+    if (NewLoad->getValueType(I) == ElemVT.getVectorElementType()) {
+      // Two consecutive scalar results form one of the original vectors.
+      Results.push_back(DCI.DAG.getBuildVector(
+          ElemVT, DL, {NewLoad.getValue(I), NewLoad.getValue(I + 1)}));
+      I += 2;
+    } else {
+      Results.push_back(NewLoad.getValue(I));
+      I += 1;
+    }
+  }
+
+  return DCI.DAG.getMergeValues(Results, DL);
+}
+
+/// Fold a packing mov into a store. This may help lower register pressure.
+///
+/// ex:
+/// v: v2f16 = build_vector a:f16, b:f16
+/// StoreRetval v
+///
+/// ...is turned into...
+///
+/// StoreRetvalV2 a:f16, b:f16
+static SDValue combinePackingMovIntoStore(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          unsigned Front, unsigned Back) {
+  // Don't run this optimization before the legalizer
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
+  // Get the type of the operands being stored.
+  EVT ElementVT = N->getOperand(Front).getValueType();
+
+  if (!Isv2x16VT(ElementVT))
+    return SDValue();
+
+  // We want to run this as late as possible since other optimizations may
+  // eliminate the BUILD_VECTORs.
+  if (!DCI.isAfterLegalizeDAG())
+    return SDValue();
----------------
AlexMaclean wrote:

Can this be combined with the `isBeforeLegalize` check above? 

https://github.com/llvm/llvm-project/pull/144581