[llvm] [LoopVectorize] Generate wide active lane masks (PR #147535)

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 14 12:49:38 PDT 2025


================
@@ -1432,20 +1433,93 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, TripCount, C);
 }
 
+static void extractFromWideActiveLaneMask(VPlan &Plan, ElementCount VF,
+                                          unsigned UF) {
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+  LLVMContext &Ctx = CanonicalIV->getScalarType()->getContext();
+  using namespace llvm::VPlanPatternMatch;
+
+  auto extractFromALM = [&](VPInstruction *ALM, VPInstruction *InsBefore,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    VPBuilder Builder(InsBefore);
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      Extracts.push_back(
+          Builder.createNaryOp(VPInstruction::ExtractSubvector, Ops, DL));
+    }
+  };
+
+  // Collect the header's active lane mask phis, indexed by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis())
+    if (auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R))
+      Phis[Phi->getUnrollPart()] = Phi;
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF; the multiplier is operand index 2 (third operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  cast<VPInstruction>(Phis[0]->getStartValue())->setOperand(2, ALMMultiplier);
+  cast<VPInstruction>(Phis[0]->getBackedgeValue())
+      ->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts;
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  extractFromALM(EntryALM, cast<VPInstruction>(&EntryALM->getParent()->back()),
+                 EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop's negated branch
+  // condition, updating that Not to use the first extract.
+  SmallVector<VPValue *> LoopExtracts;
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  extractFromALM(LoopALM, Not, LoopExtracts);
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Point each part's phi at its own entry extract and loop extract.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return;
+}
+
 /// Try to simplify the branch condition of \p Plan. This may restrict the
 /// resulting plan to \p BestVF and \p BestUF.
-static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
-                                              unsigned BestUF,
-                                              PredicatedScalarEvolution &PSE) {
+static bool
+simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
+                                  unsigned BestUF,
+                                  PredicatedScalarEvolution &PSE,
+                                  bool DataAndControlFlowWithoutRuntimeCheck) {
   VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
   auto *Term = &ExitingVPBB->back();
   VPValue *Cond;
   ScalarEvolution &SE = *PSE.getSE();
   using namespace llvm::VPlanPatternMatch;
-  if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
-      match(Term, m_BranchOnCond(
-                      m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) {
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  bool BranchALM = match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                                   m_VPValue(), m_VPValue(), m_VPValue()))));
+
+  if (BranchALM || match(Term, m_BranchOnCount(m_VPValue(), m_VPValue()))) {
+    if (BranchALM && DataAndControlFlowWithoutRuntimeCheck &&
+        EnableWideActiveLaneMask && BestVF.isVector() && BestUF > 1)
+      extractFromWideActiveLaneMask(Plan, BestVF, BestUF);
----------------
fhahn wrote:

Is there any benefit from having this here? It doesn't seem to fit, as it does not directly simplify the branch condition.

https://github.com/llvm/llvm-project/pull/147535


More information about the llvm-commits mailing list