[llvm] [NVPTX] support packed f32 instructions for sm_100+ (PR #126337)

Wed Jun 18 09:30:29 PDT 2025

================
@@ -5103,6 +5158,54 @@ static SDValue PerformFADDCombine(SDNode *N,
   return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
 }
 
+// For vector reductions that are unrolled into packed ops (e.g. fadd.f32x2),
+// the final reduction op needs to produce a scalar. By default, this results in
+// a final packed op where one of the lanes is undef:
+//
+// v1: v2f32 = fadd reassoc a, b
+// ...
+// v[N-2]: v2f32 = fadd reassoc v[N-4], v[N-3]
+//
+// # now we combine v[N-2]:0 with v[N-2]:1
+//
+// v[N-1]: v2f32 = vector_shuffle<1,u> v[N-2], undef:v2f32
+// vN: v2f32 = fadd reassoc v[N-1], v[N-2]
+//
+// result: f32 = extractelt vN, 0
+//
+// We convert this to a scalar op.
+static SDValue PerformPackedFOpCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI) {
+  // Convert (fop.x2 (vector_shuffle<1,u> V), V) -> ((fop V:1, V:0), undef)
+  const EVT VectorVT = N->getValueType(0);
+  if (!(VectorVT == MVT::v2f32 || Isv2x16VT(VectorVT)))
----------------
Prince781 wrote:

Sure. https://github.com/llvm/llvm-project/pull/143943

https://github.com/llvm/llvm-project/pull/126337