[llvm] [NVPTX] support packed f32 instructions for sm_100+ (PR #126337)

Thu Jul 10 20:31:27 PDT 2025

================
@@ -330,22 +329,30 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
   }
 
   ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
-  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
-    EVT VT = TempVTs[i];
-    uint64_t Off = TempOffsets[i];
-    // Split vectors into individual elements, except for v2f16, which
-    // we will pass as a single scalar.
+  for (auto [VT, Off] : zip(TempVTs, TempOffsets)) {
+    // Split vectors into individual elements, except for packed types
     if (VT.isVector()) {
       unsigned NumElts = VT.getVectorNumElements();
       EVT EltVT = VT.getVectorElementType();
       // We require power-of-2 sized vectors because
       // TargetLoweringBase::getVectorTypeBreakdown(), which is invoked in
       // ComputePTXValueVTs(), cannot currently break down non-power-of-2 sized
       // vectors.
-      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 &&
-          isPowerOf2_32(NumElts)) {
-        // Vectors with an even number of f16 elements will be passed to
-        // us as an array of v2f16/v2bf16 elements. We must match this so we
+
+      // Special case handling for packed i8s.
+      if (EltVT.getSimpleVT() == MVT::i8 &&
+          ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) || NumElts == 3)) {
+        // v*i8 are formally lowered as v4i8
+        EltVT = MVT::v4i8;
+        NumElts = (NumElts + 3) / 4;
+      } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
+        // v2i8 is promoted to v2i16
+        NumElts = 1;
+        EltVT = MVT::v2i8;
+      } else if (isPackedElementTy(EltVT) && NumElts % 2 == 0 &&
+                 isPowerOf2_32(NumElts)) {
----------------
Prince781 wrote:

Okay, I redid this section to rely on a declared list of supported types (see changes in `NVPTXUtilities.h`) and added more comments.

https://github.com/llvm/llvm-project/pull/126337