[llvm] [NVPTX] fold movs into loads and stores (PR #144581)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 18 09:52:08 PDT 2025
================
@@ -5047,26 +5044,244 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
-static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
- std::size_t Back) {
+/// Combine extractelts into a load by increasing the number of return values.
+static SDValue
+combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ // Don't run this optimization before the legalizer
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ EVT ElemVT = N->getValueType(0);
+ if (!Isv2x16VT(ElemVT))
+ return SDValue();
+
+ // Check whether all outputs are either used by an extractelt or are
+ // glue/chain nodes
+ if (!all_of(N->uses(), [&](SDUse &U) {
+ // Skip glue, chain nodes
+ if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
+ return true;
+ if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (N->getOpcode() != ISD::LOAD)
+ return true;
+ // Since this is an ISD::LOAD, check all extractelts are used. If
+ // any are not used, we don't want to defeat another optimization that
+ // will narrow the load.
+ //
+ // For example:
+ //
+ // L: v2f16,ch = load <p>
+ // e0: f16 = extractelt L:0, 0
+ // e1: f16 = extractelt L:0, 1 <-- unused
+ // store e0
+ //
+ // Can be optimized by DAGCombiner to:
+ //
+ // L: f16,ch = load <p>
+ // store L:0
+ return !U.getUser()->use_empty();
+ }
+
+ // Otherwise, this use prevents us from splitting a value.
+ return false;
+ }))
+ return SDValue();
+
+ auto *LD = cast<MemSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ SDLoc DL(LD);
+
+ // the new opcode after we double the number of operands
+ NVPTXISD::NodeType Opcode;
+ SmallVector<SDValue> Operands(LD->ops());
+ unsigned OldNumValues;
+ switch (LD->getOpcode()) {
+ case ISD::LOAD:
+ OldNumValues = 1;
+ // Any packed type is legal, so the legalizer will not have lowered
+ // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
+ // here.
+ Opcode = NVPTXISD::LoadV2;
+ Operands.push_back(DCI.DAG.getIntPtrConstant(
+ cast<LoadSDNode>(LD)->getExtensionType(), DL));
+ break;
+ case NVPTXISD::LoadParamV2:
+ OldNumValues = 2;
+ Opcode = NVPTXISD::LoadParamV4;
+ break;
+ case NVPTXISD::LoadV2:
+ OldNumValues = 2;
+ Opcode = NVPTXISD::LoadV4;
+ break;
+ case NVPTXISD::LoadV4:
+ // PTX doesn't support v8 for 16-bit values
+ case NVPTXISD::LoadV8:
+ // PTX doesn't support the next doubling of outputs
+ return SDValue();
+ }
+
+ SmallVector<EVT> NewVTs(OldNumValues * 2, ElemVT.getVectorElementType());
+ // add remaining chain and glue values
+ for (unsigned I = OldNumValues; I < LD->getNumValues(); ++I)
+ NewVTs.push_back(LD->getValueType(I));
+
+ // Create the new load
+ SDValue NewLoad =
+ DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs),
+ Operands, MemVT, LD->getMemOperand());
+
+ // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
+ // the outputs the same. These nodes will be optimized away in later
+ // DAGCombiner iterations.
+ SmallVector<SDValue> Results;
+ for (unsigned I = 0; I < NewLoad->getNumValues();) {
----------------
AlexMaclean wrote:
Similar to how we're constructing the value list above, can we be a little more explicit here and not rely on checking the value type?
https://github.com/llvm/llvm-project/pull/144581
More information about the llvm-commits
mailing list