[llvm] [NVPTX] fold movs into loads and stores (PR #144581)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 18 14:59:25 PDT 2025
================
@@ -5047,26 +5044,237 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
-static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
- std::size_t Back) {
+/// Combine extractelts into a load by increasing the number of return values.
+static SDValue
+combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ // Don't run this optimization before the legalizer
+ if (!DCI.isAfterLegalizeDAG())
+ return SDValue();
+
+ EVT ElemVT = N->getValueType(0);
+ if (!Isv2x16VT(ElemVT))
+ return SDValue();
+
+ // Check that every use of this node is either an extractelt of one of its
+ // results or a use of its glue/chain outputs.
+ if (!all_of(N->uses(), [&](SDUse &U) {
+ // Skip glue, chain nodes
+ if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
+ return true;
+ if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (N->getOpcode() != ISD::LOAD)
+ return true;
+ // Since this is an ISD::LOAD, check all extractelts are used. If
+ // any are not used, we don't want to defeat another optimization that
+ // will narrow the load.
+ //
+ // For example:
+ //
+ // L: v2f16,ch = load <p>
+ // e0: f16 = extractelt L:0, 0
+ // e1: f16 = extractelt L:0, 1 <-- unused
+ // store e0
+ //
+ // Can be optimized by DAGCombiner to:
+ //
+ // L: f16,ch = load <p>
+ // store L:0
+ return !U.getUser()->use_empty();
+ }
+
+ // Otherwise, this use prevents us from splitting a value.
+ return false;
+ }))
+ return SDValue();
+
+ auto *LD = cast<MemSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ SDLoc DL(LD);
+
+ // The new opcode after we double the number of result values.
+ NVPTXISD::NodeType Opcode;
+ SmallVector<SDValue> Operands(LD->ops());
+ unsigned OldNumValues; // non-glue, non-chain outputs
+ switch (LD->getOpcode()) {
+ case ISD::LOAD:
+ OldNumValues = 1;
+ // Any packed type is legal, so the legalizer will not have lowered
+ // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
+ // here.
+ Opcode = NVPTXISD::LoadV2;
+ Operands.push_back(DCI.DAG.getIntPtrConstant(
+ cast<LoadSDNode>(LD)->getExtensionType(), DL));
+ break;
+ case NVPTXISD::LoadParamV2:
+ OldNumValues = 2;
+ Opcode = NVPTXISD::LoadParamV4;
+ break;
+ case NVPTXISD::LoadV2:
+ OldNumValues = 2;
+ Opcode = NVPTXISD::LoadV4;
+ break;
+ case NVPTXISD::LoadV4:
+ // PTX doesn't support v8 for 16-bit values
+ case NVPTXISD::LoadV8:
+ // PTX doesn't support the next doubling of outputs
+ return SDValue();
+ }
+
+ SmallVector<EVT> NewVTs(OldNumValues * 2, ElemVT.getVectorElementType());
+ // add remaining chain and glue values
+ NewVTs.append(LD->value_begin() + OldNumValues, LD->value_end());
+
+ // Create the new load
+ SDValue NewLoad =
+ DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs),
+ Operands, MemVT, LD->getMemOperand());
+
+ // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
+ // the outputs the same. These nodes will be optimized away in later
+ // DAGCombiner iterations.
+ SmallVector<SDValue> Results;
+ for (unsigned I = 0; I < NewLoad->getNumValues();) {
+ if (I < OldNumValues * 2) {
+ Results.push_back(DCI.DAG.getBuildVector(
+ ElemVT, DL, {NewLoad.getValue(I), NewLoad.getValue(I + 1)}));
+ I += 2;
+ } else {
+ Results.push_back(NewLoad.getValue(I));
+ I += 1;
----------------
Prince781 wrote:
Is the idea to do something like `Results.append(NewLoad->value_begin() + OldNumValues * 2, NewLoad->value_end())` or did you have something else in mind? This doesn't work because the iterator element (`EVT`) isn't convertible to `SDValue`. So I chose to use `push_back()` here.
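For illustration only (not part of the patch): a minimal sketch of how the trailing chain/glue results could be appended without hitting the `EVT`-to-`SDValue` mismatch, by indexing the results through `SDValue::getValue()` instead of the node's value iterators. It assumes the `NewLoad`, `OldNumValues`, `ElemVT`, `DL`, `DCI`, and `Results` variables from the hunk above.

```cpp
// Hypothetical index-based alternative (not from the patch): first rebuild the
// packed v2x16 results from adjacent scalar outputs of the widened load, then
// copy the remaining chain/glue results by position.
for (unsigned I = 0; I < OldNumValues * 2; I += 2)
  Results.push_back(DCI.DAG.getBuildVector(
      ElemVT, DL, {NewLoad.getValue(I), NewLoad.getValue(I + 1)}));
for (unsigned I = OldNumValues * 2, E = NewLoad->getNumValues(); I != E; ++I)
  Results.push_back(NewLoad.getValue(I));
```

Whether that reads better than the single loop with the if/else is a matter of taste; the resulting DAG is the same either way.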
https://github.com/llvm/llvm-project/pull/144581