[llvm] [LV] Vectorize FMax w/o fast-math flags. (PR #146711)

Sun Jul 13 04:17:52 PDT 2025

================
@@ -628,3 +628,118 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
     Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
   }
 }
+
+bool VPlanTransforms::handleFMaxReductionsWithoutFastMath(VPlan &Plan) {
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPReductionPHIRecipe *RedPhiR = nullptr;
+  VPRecipeWithIRFlags *MaxOp = nullptr;
+  VPWidenIntOrFpInductionRecipe *WideIV = nullptr;
+
+  // Check if there are any FCmpOGTSelect reductions using wide selects that we
+  // can fix up. To do so, we also need a  wide canonical IV to keep track of
+  // the indices of the max values.
+  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
+    // We need a wide canonical IV
+    if (auto *CurIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+      if (!CurIV->isCanonical())
+        continue;
+      WideIV = CurIV;
+      continue;
+    }
+
+    // And a single FCmpOGTSelect reduction phi.
+    // TODO: Support FMin reductions as well.
+    auto *CurRedPhiR = dyn_cast<VPReductionPHIRecipe>(&R);
+    if (!CurRedPhiR)
+      continue;
+    if (RedPhiR)
+      return false;
+    if (CurRedPhiR->getRecurrenceKind() != RecurKind::FCmpOGTSelect ||
+        CurRedPhiR->isInLoop() || CurRedPhiR->isOrdered())
+      continue;
+    RedPhiR = CurRedPhiR;
+
+    // MaxOp feeding the reduction phi must be a select (either wide or a
+    // replicate recipe), where the phi is the last operand, and the compare
+    // predicate is strict. This ensures NaNs won't get propagated unless the
+    // initial value is NaN
+    VPRecipeBase *Inc = RedPhiR->getBackedgeValue()->getDefiningRecipe();
+    auto *RepR = dyn_cast<VPReplicateRecipe>(Inc);
+    if (!isa<VPWidenSelectRecipe>(Inc) &&
+        !(RepR && (isa<SelectInst>(RepR->getUnderlyingInstr()))))
+      return false;
+
+    MaxOp = cast<VPRecipeWithIRFlags>(Inc);
+    auto *Cmp = cast<VPRecipeWithIRFlags>(MaxOp->getOperand(0));
+    if (MaxOp->getOperand(1) == RedPhiR ||
+        !CmpInst::isStrictPredicate(Cmp->getPredicate()))
+      return false;
+  }
+
+  // Nothing to do.
+  if (!RedPhiR)
+    return true;
+
+  // A wide canonical IV is currently required.
+  // TODO: Create an induction if no suitable existing one is available.
+  if (!WideIV)
+    return false;
+
+  // Create a reduction that tracks the first indices where the latest maximum
+  // value has been selected. This is later used to select the max value from
+  // the partial reductions in a way that correctly handles signed zeros and
+  // NaNs in the input.
+  // Note that we do not need to check if the induction may hit the sentinel
+  // value. If the sentinel value gets hit, the final reduction value is at the
+  // last index or the maximum was never set and all lanes contain the start
+  // value. In either case, the correct value is selected.
+  unsigned IVWidth =
+      VPTypeAnalysis(Plan).inferScalarType(WideIV)->getScalarSizeInBits();
+  LLVMContext &Ctx = Plan.getScalarHeader()->getIRBasicBlock()->getContext();
+  VPValue *UMinSentinel =
+      Plan.getOrAddLiveIn(ConstantInt::get(Ctx, APInt::getMaxValue(IVWidth)));
+  auto *IdxPhi = new VPReductionPHIRecipe(nullptr, RecurKind::FindFirstIVUMin,
+                                          *UMinSentinel, false, false, 1);
+  IdxPhi->insertBefore(RedPhiR);
+  auto *MinIdxSel = new VPInstruction(Instruction::Select,
+                                      {MaxOp->getOperand(0), WideIV, IdxPhi});
+  MinIdxSel->insertAfter(MaxOp);
+  IdxPhi->addOperand(MinIdxSel);
+
+  // Find the first index of with the maximum value. This is used to extract the
+  // lane with the final max value and is needed to handle signed zeros and NaNs
+  // in the input.
+  auto *MaxResult = find_singleton<VPSingleDefRecipe>(
+      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+          return VPI;
+        return nullptr;
+      });
+  VPBuilder Builder(MaxResult->getParent(),
+                    std::next(MaxResult->getIterator()));
+
+  // Create mask for lanes that have the max value and use it to mask out
+  // indices that don't contain maximum values.
+  auto *MaskFinalMaxValue = Builder.createNaryOp(
+      Instruction::FCmp, {MaxResult->getOperand(1), MaxResult},
+      VPIRFlags(CmpInst::FCMP_OEQ));
+  auto *IndicesWithMaxValue = Builder.createNaryOp(
+      Instruction::Select, {MaskFinalMaxValue, MinIdxSel, UMinSentinel});
+  auto *FirstMaxIdx = Builder.createNaryOp(
+      VPInstruction::ComputeFindIVResult,
+      {IdxPhi, WideIV->getStartValue(), UMinSentinel, IndicesWithMaxValue});
+  // Convert the index of the first max value to an index in the vector lanes of
+  // the partial reduction results. This ensures we select the first max value
+  // and acts as a tie-breaker if the partial reductions contain signed zeros.
----------------
fhahn wrote:

Yes, the tie-breaking is only needed to handle signed zeroes when computing the final reduction results.

Consider a final partial reduction vector `-0.0, +0.0` where -0.0 was encountered before +0.0 (e.g. the max at iteration 2 is -0.0 and at iteration 3 it is +0.0). Doing a plain horizontal fmax reduction will produce +0.0 (-0.0 < +0.0).

We then compare the partial reduction values to the result of the horizontal reduction (-0.0 == +0.0 will also be true, selecting all lanes with zeros of any signedness).

Out of those, we select the one encountered first using FindFirstIV. Note that this only works for strict predicates. 


https://github.com/llvm/llvm-project/pull/146711