[llvm] [VPlan] Delay adding canonical IV increment. (PR #82270)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 5 15:00:30 PST 2025


================
@@ -2043,6 +1986,98 @@ void VPlanTransforms::createInterleaveGroups(
   }
 }
 
+void VPlanTransforms::convertCanonicalIV(
+    VPlan &Plan, bool HasNUW, bool DataAndControlFlowWithoutRuntimeCheck) {
+  auto *CanIV = Plan.getCanonicalIV();
+
+  VPBasicBlock *Latch = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  auto *LatchTerm = Latch->getTerminator();
+  VPBuilder Builder(LatchTerm);
+  DebugLoc DL = CanIV->getDebugLoc();
+  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+  auto *CanonicalIVIncrement =
+      Builder.createOverflowingOp(Instruction::Add, {CanIV, &Plan.getVFxUF()},
+                                  {HasNUW, false}, DL, "index.next");
+  CanIV->addOperand(CanonicalIVIncrement);
+
+  auto FoundLaneMaskPhi = find_if(
+      Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+      [](VPRecipeBase &P) { return isa<VPActiveLaneMaskPHIRecipe>(P); });
+
+  if (FoundLaneMaskPhi ==
+      Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis().end()) {
+
+    // Update BranchOnCount VPInstruction in the latch to use increment.
+    // TODO: Should have separate opcodes for separate semantics.
+    LatchTerm->setOperand(0, CanonicalIVIncrement);
+    return;
+  }
+
+  // Now adjust the start value of the active-lane-mask depending on
+  // DataAndControlFlowWithoutRuntimeCheck, introduce its increment and a
+  // conditional branch to control the loop until the lane mask is exhausted.
+  // Concretely, we add the following recipes:
+  //
+  // vector.ph:
+  //   %TripCount = calculate-trip-count-minus-VF (original TC)
+  //       [if DataWithControlFlowWithoutRuntimeCheck]
+  //   %EntryInc = canonical-iv-increment-for-part %StartV
+  //   %EntryALM = active-lane-mask %EntryInc, %TripCount (replaces the
+  //       existing start value for the existing active-lane-mask-phi)
+  //
+  // vector.body:
+  //   ...
+  //   (existing) %P = active-lane-mask-phi [ %EntryALM, %vector.ph ],
+  //   (added increment)                    [ %ALM, %vector.body ]
+  //   ...
+  //   %InLoopInc = canonical-iv-increment-for-part %IncrementValue
+  //   %ALM = active-lane-mask %InLoopInc, TripCount
+  //   %Negated = Not %ALM
+  //   branch-on-cond %Negated (replaces existing BranchOnCount)
+
+  auto *LaneMaskPhi = cast<VPActiveLaneMaskPHIRecipe>(&*FoundLaneMaskPhi);
+  auto *VecPreheader =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+  Builder.setInsertPoint(VecPreheader);
+
+  VPValue *TC = Plan.getTripCount();
+
+  // TODO: Check if dropping the flags is needed if
+  // !DataAndControlFlowWithoutRuntimeCheck.
+  CanonicalIVIncrement->dropPoisonGeneratingFlags();
+  VPValue *TripCount, *IncrementValue;
+  if (!DataAndControlFlowWithoutRuntimeCheck) {
+    // When the loop is guarded by a runtime overflow check for the loop
+    // induction variable increment by VF, we can increment the value before
+    // the get.active.lane mask and use the unmodified tripcount.
+    IncrementValue = CanonicalIVIncrement;
+    TripCount = TC;
+  } else {
+    // When avoiding a runtime check, the active.lane.mask inside the loop
+    // uses a modified trip count and the induction variable increment is
+    // done after the active.lane.mask intrinsic is called.
+    IncrementValue = CanIV;
+    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+                                     {TC}, DL);
+  }
+  // Create the active lane mask for the next iteration of the loop before the
+  // original terminator.
+  Builder.setInsertPoint(Latch);
+  auto *InLoopIncrement =
+      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
+                                  {IncrementValue}, {false, false}, DL);
----------------
ayalz wrote:

Worth clarifying the distinction between CanonicalIVIncrement and CanonicalIVIncrementForPart: the former is the scalar value CanIV + VF * UF which is uniform across VF * UF and belongs to the scalar canonical IV chain, whereas the latter is the scalar value CanIV + (VF * UF) + P * UF which is uniform across VF and uses the canonical IV chain (either pre or post increment), as in "build scalar steps" per part P. In particular, the latter is useful only for UF>1, i.e., when unrolling. Right?

https://github.com/llvm/llvm-project/pull/82270


More information about the llvm-commits mailing list