[llvm] [VPlan] First step towards VPlan cost modeling. (PR #67934)
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Fri May 10 07:43:35 PDT 2024
================
@@ -7395,6 +7396,177 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return VF;
}
+static InstructionCost // Cost of recipe R at VF; 0 when R is skipped.
+computeCostForRecipe(VPRecipeBase *R, ElementCount VF,
+ SmallPtrSetImpl<Instruction *> &SeenUI, // underlying instructions already costed
+ LoopVectorizationCostModel &CM, // legacy cost model, used as the fallback
+ const TargetTransformInfo &TTI, VPCostContext CostCtx) { // TTI is not used in this body
+ Instruction *UI = nullptr;
+ if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
+ UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
+ if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second)) // skip ignored or already-costed instructions
+ return 0;
+ // The check above also records a non-ignored UI in SeenUI as a side effect.
+ InstructionCost RecipeCost = R->computeCost(VF, CostCtx); // prefer the recipe's own cost
+ if (!RecipeCost.isValid()) { // otherwise query the legacy cost model
+ if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) {
+ RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first;
+ } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) {
+ RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first;
+ } else if (UI) {
+ RecipeCost = CM.getInstructionCost(UI, VF).first;
+ } else
+ return 0; // no instruction available to query the legacy model with
+ }
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0 &&
+ RecipeCost.isValid())
+ RecipeCost = InstructionCost(ForceTargetInstructionCost); // a forced cost replaces any valid cost
+
+ LLVM_DEBUG({
+ dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": ";
+ R->dump();
+ });
+ return RecipeCost;
+}
+
+static InstructionCost computeCostForReplicatorRegion( // Cost of replicator region Region at VF.
+ VPRegionBlock *Region, ElementCount VF,
+ SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM,
+ const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) {
+ using namespace llvm::VPlanPatternMatch;
+ InstructionCost RegionCost = 0;
+ assert(Region->isReplicator() &&
+ "can only compute cost for a replicator region");
+ VPBasicBlock *Then = // first successor of the region's entry block
+ cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]);
+ for (VPRecipeBase &R : *Then) // only the recipes of that block are summed
+ RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+ // NOTE(review): the loop above passes CM.TTI rather than the TTI parameter.
+ // Note the cost estimates below closely match the current legacy cost model.
+ auto *BOM =
+ cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front());
+ VPValue *Cond = BOM->getOperand(0);
+
+ // Check if Cond is a uniform compare. The lambda is invoked immediately, so IsUniformCompare is a bool.
+ auto IsUniformCompare = [Cond]() {
+ VPValue *Op = Cond;
+ if (match(Op, m_Not(m_VPValue()))) // look through a single Not
+ Op = Op->getDefiningRecipe()->getOperand(0);
+ auto *R = Op->getDefiningRecipe();
+ if (!R)
+ return true; // a value without a defining recipe counts as uniform here
+ if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())))
+ return false; // defined by something other than an ICmp
+ return all_of(R->operands(), [](VPValue *Op) {
+ return vputils::isUniformAfterVectorization(Op);
+ });
+ }();
+ bool IsHeaderMaskOrUniformCond =
+ IsUniformCompare ||
+ match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) ||
+ match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) ||
+ isa<VPActiveLaneMaskPHIRecipe>(Cond);
+ if (IsHeaderMaskOrUniformCond || VF.isScalable()) // no predication overhead is added in these cases
+ return RegionCost;
+
+ // For the scalar case, we may not always execute the original predicated
+ // block. Thus, scale the block's cost by the probability of executing it.
+ // NOTE(review): the legacy wording mentioned Legal's blockNeedsPredication,
+ // but this function never calls it; confirm whether a check is missing.
+ if (VF.isScalar())
+ return RegionCost / getReciprocalPredBlockProb();
+ // Remaining case: a fixed-width vector VF whose condition matched none of the checks above.
+ // Add the cost for branches around scalarized and predicated blocks.
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF);
+ return RegionCost +
+ TTI.getScalarizationOverhead( // extract every lane of the <VF x i1> mask
+ Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()),
+ /*Insert*/ false, /*Extract*/ true, CostKind) +
+ (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()); // one branch per lane
+}
+
+InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan, // Sums the cost of Plan's vector loop at VF.
+ ElementCount VF) {
+ InstructionCost Cost = 0;
+ SmallPtrSet<Instruction *, 8> SeenUI; // instructions whose cost has already been added
+ LLVMContext &Ctx = OrigLoop->getHeader()->getContext();
+ VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx);
+
+ // Cost modeling for inductions is inaccurate in the legacy cost model
+ // compared to the recipes that are generated. To match it initially, during
+ // VPlan cost-model bring-up, directly use the induction costs from the
+ // legacy cost model and skip the induction recipes.
+ for (const auto &[IV, _] : Legal->getInductionVars()) {
+ Instruction *IVInc = cast<Instruction>( // the value IV receives from the loop latch
+ IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
+ InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first;
+ LLVM_DEBUG({
+ dbgs() << "Cost of " << RecipeCost << " for VF " << VF
+ << ":\n induction increment ";
+ IVInc->dump();
+ });
+ Cost += RecipeCost;
+ SeenUI.insert(IVInc); // recipes with this underlying instruction are then not costed again
+ }
+ // Walk the blocks reachable from the vector loop region's entry block.
+ VPBasicBlock *Header =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry());
+ for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) { // presumably does not descend into nested regions; confirm
+ if (auto *Region = dyn_cast<VPRegionBlock>(Block)) { // a nested region is costed as one unit
+ Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI,
+ Ctx, CostCtx);
+ continue;
+ }
+
+ for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) // every other block is cast to a VPBasicBlock
+ Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx);
+ }
+
+ // Add the cost for the backedge, modeled here as a fixed cost of 1.
+ Cost += 1;
+ LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
+ return Cost;
+}
+
+std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() {
+ // If there is a single VPlan with a single VF, return it directly.
+ if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) {
+ ElementCount VF = *VPlans[0]->vectorFactors().begin();
+ return {*VPlans[0], VF};
+ }
+
+ VPlan *BestPlan = &*VPlans[0];
+ assert(hasPlanWithVF(ElementCount::getFixed(1)));
----------------
fhahn wrote:
Done, thanks!
https://github.com/llvm/llvm-project/pull/67934
More information about the llvm-commits
mailing list