[llvm] [NVPTX] support packed f32 instructions for sm_100+ (PR #126337)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 22 15:24:02 PDT 2025


================
@@ -5051,26 +5080,395 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
-                                         std::size_t Back) {
+/// Lower a vector load into an NVPTX LoadV2/LoadV4 node. Returns the loaded
+/// value (a build_vector when \p BuildVector is set, otherwise the raw target
+/// node) together with the load chain, or std::nullopt if the load cannot be
+/// handled. \p OverrideVT allows overriding the result and memory type.
+static std::optional<std::pair<SDValue, SDValue>>
+convertVectorLoad(SDNode *N, SelectionDAG &DAG, bool BuildVector,
+                  std::optional<EVT> OverrideVT = std::nullopt) {
+  EVT ResVT = N->getValueType(0);
+  if (OverrideVT)
+    ResVT = *OverrideVT;
+  SDLoc DL(N);
+
+  assert(ResVT.isVector() && "Vector load must have vector type");
+
+  auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
+  if (!NumEltsAndEltVT)
+    return std::nullopt;
+  auto [NumElts, EltVT] = NumEltsAndEltVT.value();
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+
+  Align Alignment = LD->getAlign();
+  auto &TD = DAG.getDataLayout();
+  Align PrefAlign = TD.getPrefTypeAlign(
+      OverrideVT.value_or(LD->getMemoryVT()).getTypeForEVT(*DAG.getContext()));
+  if (Alignment < PrefAlign) {
+    // This load is not sufficiently aligned, so bail out and let this vector
+    // load be scalarized.  Note that we may still be able to emit smaller
+    // vector loads.  For example, if we are loading a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return std::nullopt;
+  }
+
+  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // loaded type to i16 and propagate the "real" type as the memory type.
+  bool NeedTrunc = false;
+  if (EltVT.getSizeInBits() < 16) {
+    EltVT = MVT::i16;
+    NeedTrunc = true;
+  }
+
+  unsigned Opcode = 0;
+  SDVTList LdResVTs;
+
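+  // NVPTX provides only 2- and 4-element vector loads (ld.v2/ld.v4), so bail
+  // out on any other element count.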
+  switch (NumElts) {
+  default:
+    return std::nullopt;
+  case 2:
+    Opcode = NVPTXISD::LoadV2;
+    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
+    break;
+  case 4: {
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
+    LdResVTs = DAG.getVTList(ListVTs);
+    break;
+  }
+  }
+
+  // Copy regular operands
+  SmallVector<SDValue, 8> OtherOps(N->ops());
+
+  // The select routine does not have access to the LoadSDNode instance, so
+  // pass along the extension type as an additional operand.
+  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
+
+  SDValue NewLD = DAG.getMemIntrinsicNode(
+      Opcode, DL, LdResVTs, OtherOps, OverrideVT.value_or(LD->getMemoryVT()),
+      LD->getMemOperand());
+
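+  // The chain is the last result of the new node, after the NumElts data
+  // values.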
+  SDValue LoadChain = NewLD.getValue(NumElts);
+
+  if (BuildVector) {
+    SmallVector<SDValue> ScalarRes;
+    assert(NumElts <= ResVT.getVectorNumElements() &&
+           "NumElts should not increase, only decrease or stay the same.");
+    if (NumElts < ResVT.getVectorNumElements()) {
+      // If the number of elements has decreased, getVectorLoweringShape has
+      // upsized the element types.
+      assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
+             EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
+      // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+      // into individual elements.
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue SubVector = NewLD.getValue(i);
+        DAG.ExtractVectorElements(SubVector, ScalarRes);
+      }
+    } else {
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue Res = NewLD.getValue(i);
+        if (NeedTrunc)
+          Res =
+              DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+        ScalarRes.push_back(Res);
+      }
+    }
+
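+    // Reassemble the scalar pieces into the originally requested vector type.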
+    SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);
+    return {{BuildVec, LoadChain}};
+  }
+
+  return {{NewLD, LoadChain}};
+}
+
+static SDValue PerformLoadCombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI) {
+  auto *MemN = cast<MemSDNode>(N);
+  // only operate on vectors of f32s / i64s
----------------
AlexMaclean wrote:

Why is this restricted to vectors of f32s / i64s?

https://github.com/llvm/llvm-project/pull/126337

