[llvm] [LV] Create LoopVectorizationCostModel hdr (NFC) (PR #159093)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 16 06:26:56 PDT 2025
https://github.com/artagnon created https://github.com/llvm/llvm-project/pull/159093
None
From cbcf02bbacd0b34b6d240c90cfc0eb393e11d64e Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Tue, 16 Sep 2025 14:20:28 +0100
Subject: [PATCH] [LV] Create LoopVectorizationCostModel hdr (NFC)
---
.../Vectorize/LoopVectorizationCostModel.h | 847 ++++++++++++++
.../Vectorize/LoopVectorizationPlanner.h | 31 -
.../Transforms/Vectorize/LoopVectorize.cpp | 1008 ++---------------
3 files changed, 964 insertions(+), 922 deletions(-)
create mode 100644 llvm/lib/Transforms/Vectorize/LoopVectorizationCostModel.h
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationCostModel.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationCostModel.h
new file mode 100644
index 0000000000000..42ef0a44c1fc8
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationCostModel.h
@@ -0,0 +1,847 @@
+//===- LoopVectorizationCostModel.h - Costing for LoopVectorize -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONCOSTMODEL_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONCOSTMODEL_H
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+extern cl::opt<bool> ForceTargetSupportsScalableVectors;
+extern cl::opt<cl::boolOrDefault> ForceSafeDivisor;
+extern cl::opt<bool> PreferPredicatedReductionSelect;
+
+/// A class that represents two vectorization factors (initialized with 0 by
+/// default). One for fixed-width vectorization and one for scalable
+/// vectorization. This can be used by the vectorizer to choose from a range of
+/// fixed and/or scalable VFs in order to find the most cost-effective VF to
+/// vectorize with.
+struct FixedScalableVFPair {
+ ElementCount FixedVF;
+ ElementCount ScalableVF;
+
+ FixedScalableVFPair()
+ : FixedVF(ElementCount::getFixed(0)),
+ ScalableVF(ElementCount::getScalable(0)) {}
+ FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
+ *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
+ }
+ FixedScalableVFPair(const ElementCount &FixedVF,
+ const ElementCount &ScalableVF)
+ : FixedVF(FixedVF), ScalableVF(ScalableVF) {
+ assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
+ "Invalid scalable properties");
+ }
+
+ static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
+
+ /// \return true if either fixed- or scalable VF is non-zero.
+ explicit operator bool() const { return FixedVF || ScalableVF; }
+
+ /// \return true if either fixed- or scalable VF is a valid vector VF.
+ bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
+};
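+
+// Usage sketch (illustrative, with a hypothetical MaxVF): constructing the
+// pair from a single ElementCount routes it to the matching member, and the
+// explicit bool conversion reports whether any VF was found:
+//   FixedScalableVFPair MaxVF(ElementCount::getScalable(4));
+//   // MaxVF.FixedVF stays fixed-0; MaxVF.ScalableVF is vscale x 4.
+//   if (!MaxVF)
+//     ...; // Neither VF is non-zero: avoid vectorization up front.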
+
+// Hints for the loop vectorization cost model about how the scalar epilogue
+// loop should be lowered.
+enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
+ CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ CM_ScalarEpilogueNotAllowedOptSize,
+
+  // A special case of vectorization with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
+ CM_ScalarEpilogueNotNeededUsePredicate,
+
+  // Directive indicating we must either tail fold or not vectorize.
+ CM_ScalarEpilogueNotAllowedUsePredicate
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen for a
+/// number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+ friend class LoopVectorizationPlanner;
+
+public:
+ LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, const Function *F,
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI,
+ ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {
+ if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
+ initializeVScaleForTuning();
+ CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
+ // Query this against the original loop and save it here because the profile
+ // of the original loop header may change as the transformation happens.
+ OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ }
+
+ /// \return An upper bound for the vectorization factors (both fixed and
+ /// scalable). If the factors are 0, vectorization and interleaving should be
+ /// avoided up front.
+ FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
+
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
+
+  /// Set up cost-based decisions for the user vectorization factor.
+ /// \return true if the UserVF is a feasible VF to be chosen.
+ bool selectUserVectorizationFactor(ElementCount UserVF) {
+ collectNonVectorizedAndSetWideningDecisions(UserVF);
+ return expectedCost(UserVF).isValid();
+ }
+
+ /// \return True if maximizing vector bandwidth is enabled by the target or
+ /// user options, for the given register kind.
+ bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
+
+ /// \return True if register pressure should be considered for the given VF.
+ bool shouldConsiderRegPressureForVF(ElementCount VF);
+
+ /// \return The size (in bits) of the smallest and widest types in the code
+  /// that needs to be vectorized. We ignore values that remain scalar, such as
+  /// 64-bit loop indices.
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+
+  /// A memory access instruction may be vectorized in more than one way; the
+  /// form of the instruction after vectorization depends on cost. This
+  /// function makes cost-based decisions for Load/Store instructions and
+  /// collects them in a map. This decision map is used for building the lists
+  /// of loop-uniform and loop-scalar instructions. The calculated cost is
+  /// saved with the widening decision in order to avoid redundant
+  /// calculations.
+ void setCostBasedWideningDecision(ElementCount VF);
+
+ /// A call may be vectorized in different ways depending on whether we have
+ /// vectorized variants available and whether the target supports masking.
+ /// This function analyzes all calls in the function at the supplied VF,
+ /// makes a decision based on the costs of available options, and stores that
+ /// decision in a map for use in planning and plan execution.
+ void setVectorizedCallDecision(ElementCount VF);
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
+
+ /// Collect all element types in the loop for which widening is needed.
+ void collectElementTypesForWidening();
+
+  /// Split reductions into those that happen in the loop, and those that
+  /// happen outside. In-loop reductions are collected into InLoopReductions.
+ void collectInLoopReductions();
+
+ /// Returns true if we should use strict in-order reductions for the given
+ /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
+ /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
+ /// of FP operations.
+ bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
+ return !Hints->allowReordering() && RdxDesc.isOrdered();
+ }
+
+ /// \returns The smallest bitwidth each instruction can be represented with.
+ /// The vector equivalents of these instructions should be truncated to this
+ /// type.
+ const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
+ return MinBWs;
+ }
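+
+  // Sketch of the idea (hypothetical values): if MinBWs maps an i32 add to 8
+  // because only its low 8 bits are demanded, the vectorized add should be
+  // performed in <VF x i8> and extended only where a wider use requires it.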
+
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
+ bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
+ assert(VF.isVector() &&
+ "Profitable to scalarize relevant only for VF > 1.");
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
+
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.contains(I);
+ }
+
+ /// Returns true if \p I is known to be uniform after vectorization.
+ bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
+ // Pseudo probe needs to be duplicated for each unrolled iteration and
+ // vector lane so that profiled loop trip count can be accurately
+    // accumulated instead of being undercounted.
+ if (isa<PseudoProbeInst>(I))
+ return false;
+
+ if (VF.isScalar())
+ return true;
+
+ auto UniformsPerVF = Uniforms.find(VF);
+ assert(UniformsPerVF != Uniforms.end() &&
+ "VF not yet analyzed for uniformity");
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
+ bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
+ if (VF.isScalar())
+ return true;
+
+ auto ScalarsPerVF = Scalars.find(VF);
+ assert(ScalarsPerVF != Scalars.end() &&
+ "Scalar values are not calculated for VF");
+ return ScalarsPerVF->second.count(I);
+ }
+
+ /// \returns True if instruction \p I can be truncated to a smaller bitwidth
+ /// for vectorization factor \p VF.
+ bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
+ return VF.isVector() && MinBWs.contains(I) &&
+ !isProfitableToScalarize(I, VF) &&
+ !isScalarAfterVectorization(I, VF);
+ }
+
+  /// Decision that was taken during cost calculation for a memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen, // For consecutive accesses with stride +1.
+ CM_Widen_Reverse, // For consecutive accesses with stride -1.
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize,
+ CM_VectorCall,
+ CM_IntrinsicCall
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
+ void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
+ assert(VF.isVector() && "Expected VF >=2");
+ WideningDecisions[{I, VF}] = {W, Cost};
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
+ void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
+ ElementCount VF, InstWidening W,
+ InstructionCost Cost) {
+ assert(VF.isVector() && "Expected VF >=2");
+    // Broadcast this decision to all instructions inside the group. When
+    // interleaving, the cost will only be assigned to one instruction, the
+    // insert position. For other cases, add the appropriate fraction of the
+    // total cost to each instruction. This ensures accurate costs are used,
+    // even if the insert position instruction is not used.
+ InstructionCost InsertPosCost = Cost;
+ InstructionCost OtherMemberCost = 0;
+    if (W != CM_Interleave)
+      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
+ for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
+ if (auto *I = Grp->getMember(Idx)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[{I, VF}] = {W, InsertPosCost};
+ else
+ WideningDecisions[{I, VF}] = {W, OtherMemberCost};
+ }
+ }
+ }
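+
+  // Worked example (hypothetical numbers): for a group with 3 members and
+  // Cost = 12, a non-interleave decision such as CM_GatherScatter assigns
+  // 12 / 3 = 4 to every member, while CM_Interleave assigns the full 12 to
+  // the insert position and 0 to the other members.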
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
+ InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
+ assert(VF.isVector() && "Expected VF to be a vector VF");
+ assert(
+ TheLoop->isInnermost() &&
+ "cost-model should not be used for outer loops (in VPlan-native path)");
+
+ std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
+ InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
+ assert(VF.isVector() && "Expected VF >=2");
+ std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
+ assert(WideningDecisions.contains(InstOnVF) &&
+ "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ struct CallWideningDecision {
+ InstWidening Kind;
+ Function *Variant;
+ Intrinsic::ID IID;
+ std::optional<unsigned> MaskPos;
+ InstructionCost Cost;
+ };
+
+ void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
+ Function *Variant, Intrinsic::ID IID,
+ std::optional<unsigned> MaskPos,
+ InstructionCost Cost) {
+ assert(!VF.isScalar() && "Expected vector VF");
+ CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
+ }
+
+ CallWideningDecision getCallWideningDecision(CallInst *CI,
+ ElementCount VF) const {
+ assert(!VF.isScalar() && "Expected vector VF");
+ auto I = CallWideningDecisions.find({CI, VF});
+ if (I == CallWideningDecisions.end())
+ return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
+ return I->second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
+ bool isOptimizableIVTruncate(Instruction *I, ElementCount VF);
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
+ void collectInstsToScalarize(ElementCount VF);
+
+ /// Collect values that will not be widened, including Uniforms, Scalars, and
+ /// Instructions to Scalarize for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
+ /// Also make a decision on what to do about call instructions in the loop
+ /// at that VF -- scalarize, call a known vector routine, or call a
+ /// vector intrinsic.
+ void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
+ // Do the analysis once.
+ if (VF.isScalar() || Uniforms.contains(VF))
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ setVectorizedCallDecision(VF);
+ collectLoopScalars(VF);
+ collectInstsToScalarize(VF);
+ }
+
+  /// Returns true if the target machine supports a masked store operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
+ unsigned AddressSpace) const;
+
+  /// Returns true if the target machine supports a masked load operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
+ unsigned AddressSpace) const;
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getLoadStoreType(V);
+ Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
+ return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
+ (SI && TTI.isLegalMaskedScatter(Ty, Align));
+ }
+
+ /// Returns true if the target machine supports all of the reduction
+ /// variables found for the given VF.
+ bool canVectorizeReductions(ElementCount VF) const;
+
+ /// Given costs for both strategies, return true if the scalar predication
+ /// lowering should be used for div/rem. This incorporates an override
+ /// option so it is not simply a cost comparison.
+ bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
+ InstructionCost SafeDivisorCost) const {
+ switch (ForceSafeDivisor) {
+ case cl::BOU_UNSET:
+ return ScalarCost < SafeDivisorCost;
+ case cl::BOU_TRUE:
+ return false;
+ case cl::BOU_FALSE:
+ return true;
+ }
+ llvm_unreachable("impossible case value");
+ }
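+
+  // Example (illustrative): under -force-widen-divrem-via-safe-divisor=true
+  // (cl::BOU_TRUE) this always prefers the safe-divisor strategy, regardless
+  // of the computed costs; with the option unset, the cheaper strategy wins.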
+
+ /// Returns true if \p I is an instruction which requires predication and
+ /// for which our chosen predication strategy is scalarization (i.e. we
+ /// don't have an alternate strategy such as masking available).
+ /// \p VF is the vectorization factor that will be used to vectorize \p I.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
+
+ /// Returns true if \p I is an instruction that needs to be predicated
+ /// at runtime. The result is independent of the predication mechanism.
+ /// Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) const;
+
+ /// Return the costs for our two available strategies for lowering a
+ /// div/rem operation which requires speculating at least one lane.
+ /// First result is for scalarization (will be invalid for scalable
+ /// vectors); second is for the safe-divisor strategy.
+ std::pair<InstructionCost, InstructionCost>
+ getDivRemSpeculationCost(Instruction *I, ElementCount VF) const;
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
+ bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
+
+ /// Returns true if \p I is a memory instruction in an interleaved-group
+ /// of memory accesses that can be vectorized with wide vector loads/stores
+ /// and shuffles.
+ bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) const {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup<Instruction> *
+ getInterleavedAccessGroup(Instruction *Instr) const {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
+ /// Returns true if we're required to use a scalar epilogue for at least
+ /// the final iteration of the original loop.
+ bool requiresScalarEpilogue(bool IsVectorizing) const;
+
+  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
+  /// disallowed due to optsize or a loop hint annotation.
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ }
+
+ /// Returns the TailFoldingStyle that is best for the current loop.
+ TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
+ if (!ChosenTailFoldingStyle)
+ return TailFoldingStyle::None;
+ return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
+ : ChosenTailFoldingStyle->second;
+ }
+
+  /// Selects and saves the TailFoldingStyle for two cases: when the IV update
+  /// may overflow and when it may not.
+  /// \param IsScalableVF true if scalable vector factors are enabled.
+  /// \param UserIC User-specified interleave count.
+ void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC);
+
+  /// Returns true if all loop blocks should be masked to fold the loop tail.
+ bool foldTailByMasking() const {
+ // TODO: check if it is possible to check for None style independent of
+ // IVUpdateMayOverflow flag in getTailFoldingStyle.
+ return getTailFoldingStyle() != TailFoldingStyle::None;
+ }
+
+  /// Return the maximum safe number of elements to be processed per vector
+ /// iteration, which do not prevent store-load forwarding and are safe with
+ /// regard to the memory dependencies. Required for EVL-based VPlans to
+ /// correctly calculate AVL (application vector length) as min(remaining AVL,
+ /// MaxSafeElements).
+ /// TODO: need to consider adjusting cost model to use this value as a
+ /// vectorization factor for EVL-based vectorization.
+ std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
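+
+  // Example (hypothetical numbers): with MaxSafeElements = 8, an EVL-based
+  // plan computes AVL = min(remaining trip count, 8) on each iteration, so
+  // no vector step processes more elements than the memory dependencies
+  // allow.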
+
+  /// Returns true if the instructions in this block require predication
+ /// for any reason, e.g. because tail folding now requires a predicate
+ /// or because the block in the original loop was predicated.
+ bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const;
+
+ /// Returns true if VP intrinsics with explicit vector length support should
+ /// be generated in the tail folded loop.
+ bool foldTailWithEVL() const {
+ return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
+ }
+
+ /// Returns true if the Phi is part of an inloop reduction.
+ bool isInLoopReduction(PHINode *Phi) const {
+ return InLoopReductions.contains(Phi);
+ }
+
+ /// Returns true if the predicated reduction select should be used to set the
+ /// incoming value for the reduction phi.
+ bool usePredicatedReductionSelect() const {
+ // Force to use predicated reduction select since the EVL of the
+ // second-to-last iteration might not be VF*UF.
+ if (foldTailWithEVL())
+ return true;
+ return PreferPredicatedReductionSelect ||
+ TTI.preferPredicatedReductionSelect();
+ }
+
+ /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+ /// with factor VF. Return the cost of the instruction, including
+ /// scalarization overhead if it's needed.
+ InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
+
+ /// Estimate cost of a call instruction CI if it were vectorized with factor
+ /// VF. Return the cost of the instruction, including scalarization overhead
+ /// if it's needed.
+ InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
+
+ /// Invalidates decisions already taken by the cost model.
+ void invalidateCostModelingDecisions() {
+ WideningDecisions.clear();
+ CallWideningDecisions.clear();
+ Uniforms.clear();
+ Scalars.clear();
+ }
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ InstructionCost expectedCost(ElementCount VF);
+
+ bool hasPredStores() const { return NumPredStores > 0; }
+
+ /// Returns true if epilogue vectorization is considered profitable, and
+ /// false otherwise.
+ /// \p VF is the vectorization factor chosen for the original loop.
+  /// \p IC is an additional scaling factor applied to VF before
+ /// comparing to EpilogueVectorizationMinVF.
+ bool isEpilogueVectorizationProfitable(const ElementCount VF,
+ const unsigned IC) const;
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
+
+ /// Return the cost of instructions in an inloop reduction pattern, if I is
+ /// part of that pattern.
+ std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
+ ElementCount VF,
+ Type *VectorTy) const;
+
+ /// Returns true if \p Op should be considered invariant and if it is
+ /// trivially hoistable.
+ bool shouldConsiderInvariant(Value *Op);
+
+ /// Return the value of vscale used for tuning the cost model.
+ std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
+
+private:
+ unsigned NumPredStores = 0;
+
+ /// Used to store the value of vscale used for tuning the cost model. It is
+ /// initialized during object construction.
+ std::optional<unsigned> VScaleForTuning;
+
+  /// Initializes the value of vscale used for tuning the cost model. If
+  /// vscale_range.min == vscale_range.max, use vscale_range.max; otherwise
+  /// use the value returned by the corresponding TTI method.
+ void initializeVScaleForTuning() {
+ const Function *Fn = TheLoop->getHeader()->getParent();
+ if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
+ auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
+ auto Min = Attr.getVScaleRangeMin();
+ auto Max = Attr.getVScaleRangeMax();
+ if (Max && Min == Max) {
+ VScaleForTuning = Max;
+ return;
+ }
+ }
+
+ VScaleForTuning = TTI.getVScaleForTuning();
+ }
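+
+  // Example (illustrative attributes): a function with vscale_range(2,2) pins
+  // VScaleForTuning to 2, while vscale_range(1,16) defers the choice to
+  // TTI.getVScaleForTuning().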
+
+ /// \return An upper bound for the vectorization factors for both
+ /// fixed and scalable vectorization, where the minimum-known number of
+ /// elements is a power-of-2 larger than zero. If scalable vectorization is
+ /// disabled or unsupported, then the scalable part will be equal to
+ /// ElementCount::getScalable(0).
+ FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
+ ElementCount UserVF,
+ bool FoldTailByMasking);
+
+  /// If \p VF > MaxTripCount, clamps it to the next lower VF that is <=
+ /// MaxTripCount.
+ ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
+ bool FoldTailByMasking) const;
+
+  /// \return the maximized element count based on the target's vector
+ /// registers and the loop trip-count, but limited to a maximum safe VF.
+ /// This is a helper function of computeFeasibleMaxVF.
+ ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
+ unsigned SmallestType,
+ unsigned WidestType,
+ ElementCount MaxSafeVF,
+ bool FoldTailByMasking);
+
+ /// Checks if scalable vectorization is supported and enabled. Caches the
+ /// result to avoid repeated debug dumps for repeated queries.
+ bool isScalableVectorizationAllowed();
+
+ /// \return the maximum legal scalable VF, based on the safe max number
+ /// of elements.
+ ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
+
+ /// Calculate vectorization cost of memory instruction \p I.
+ InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
+
+  /// The cost computation for a scalarized memory instruction.
+ InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
+
+  /// The cost computation for an interleaving group of memory instructions.
+ InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
+
+  /// The cost computation for a Gather/Scatter instruction.
+ InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
+ InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
+
+  /// The cost calculation for a Load/Store instruction \p I with a uniform
+  /// pointer. Load: scalar load + broadcast. Store: scalar store + (loop
+  /// invariant value stored ? 0 : extract of last element).
+ InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
+
+ /// Estimate the overhead of scalarizing an instruction. This is a
+ /// convenience wrapper for the type-based getScalarizationOverhead API.
+ InstructionCost getScalarizationOverhead(Instruction *I,
+ ElementCount VF) const;
+
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction *, uint64_t> MinBWs;
+
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
+ using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
+
+  /// A set containing all BasicBlocks that are known to be present after
+  /// vectorization as a predicated block.
+ DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
+ PredicatedBBsAfterVectorization;
+
+ /// Records whether it is allowed to have the original scalar loop execute at
+ /// least once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+  /// iterations when the trip count is unknown or is not a multiple of the VF,
+ /// or as a peel-loop to handle gaps in interleave-groups.
+ /// Under optsize and when the trip count is very small we don't allow any
+ /// iterations to execute in the scalar loop.
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+
+  /// The finally chosen tail folding styles. The first element is used if the
+  /// IV update may overflow; the second, if it does not.
+ std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
+ ChosenTailFoldingStyle;
+
+  /// True if scalable vectorization is supported and enabled.
+ std::optional<bool> IsScalableVectorizationAllowed;
+
+ /// Maximum safe number of elements to be processed per vector iteration,
+ /// which do not prevent store-load forwarding and are safe with regard to the
+  /// memory dependencies. Required for EVL-based vectorization, where this
+ /// value is used as the upper bound of the safe AVL.
+ std::optional<unsigned> MaxSafeElements;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+ /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+ MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
+
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
+
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
+ DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
+
+ /// PHINodes of the reductions that should be expanded in-loop.
+ SmallPtrSet<PHINode *, 4> InLoopReductions;
+
+ /// A Map of inloop reduction operations and their immediate chain operand.
+ /// FIXME: This can be removed once reductions can be costed correctly in
+ /// VPlan. This was added to allow quick lookup of the inloop operations.
+ DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
+
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
+ InstructionCost computePredInstDiscount(Instruction *PredInst,
+ ScalarCostsTy &ScalarCosts,
+ ElementCount VF);
+
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
+ void collectLoopUniforms(ElementCount VF);
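+
+  // Example (illustrative): the pointer operand of a consecutive load is
+  // uniform, needing one scalar address per vector iteration, whereas a
+  // scalarized address computation is replicated VF times, once per original
+  // scalar iteration.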
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. collectLoopScalars should only add non-uniform nodes
+ /// to the list if they are used by a load/store instruction that is marked as
+ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
+ /// VF values in the vectorized loop, each corresponding to an iteration of
+ /// the original scalar loop.
+ void collectLoopScalars(ElementCount VF);
+
+ /// Keeps cost model vectorization decision and cost for instructions.
+ /// Right now it is used for memory instructions only.
+ using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
+ std::pair<InstWidening, InstructionCost>>;
+
+ DecisionList WideningDecisions;
+
+ using CallDecisionList =
+ DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
+
+ CallDecisionList CallWideningDecisions;
+
+ /// Returns true if \p V is expected to be vectorized and it needs to be
+ /// extracted.
+ bool needsExtract(Value *V, ElementCount VF) const {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (VF.isScalar() || !I || !TheLoop->contains(I) ||
+ TheLoop->isLoopInvariant(I) ||
+ getWideningDecision(I, VF) == CM_Scalarize ||
+ (isa<CallInst>(I) &&
+ getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
+ return false;
+
+ // Assume we can vectorize V (and hence we need extraction) if the
+ // scalars are not computed yet. This can happen, because it is called
+ // via getScalarizationOverhead from setCostBasedWideningDecision, before
+ // the scalars are collected. That should be a safe assumption in most
+ // cases, because we check if the operands have vectorizable types
+ // beforehand in LoopVectorizationLegality.
+ return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
+  }
+
+ /// Returns a range containing only operands needing to be extracted.
+ SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+ ElementCount VF) const {
+
+ SmallPtrSet<const Value *, 4> UniqueOperands;
+ SmallVector<Value *, 4> Res;
+ for (Value *Op : Ops) {
+ if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
+ !needsExtract(Op, VF))
+ continue;
+ Res.push_back(Op);
+ }
+ return Res;
+ }
+
+public:
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+
+ /// Assumption cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ const Function *TheFunction;
+
+ /// Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
+
+  /// The interleave access information contains groups of interleaved accesses
+  /// with the same stride that are close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+
+ /// All element types found in the loop.
+ SmallPtrSet<Type *, 16> ElementTypesInLoop;
+
+ /// The kind of cost that we are calculating
+ TTI::TargetCostKind CostKind;
+
+  /// Whether this loop should be optimized for size based on a function
+  /// attribute or profile information.
+ bool OptForSize;
+
+ /// The highest VF possible for this loop, without using MaxBandwidth.
+ FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONCOSTMODEL_H
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index d34d2ae7a0b31..0d1c72ac61068 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -398,37 +398,6 @@ struct VectorizationFactor {
}
};
-/// A class that represents two vectorization factors (initialized with 0 by
-/// default). One for fixed-width vectorization and one for scalable
-/// vectorization. This can be used by the vectorizer to choose from a range of
-/// fixed and/or scalable VFs in order to find the most cost-effective VF to
-/// vectorize with.
-struct FixedScalableVFPair {
- ElementCount FixedVF;
- ElementCount ScalableVF;
-
- FixedScalableVFPair()
- : FixedVF(ElementCount::getFixed(0)),
- ScalableVF(ElementCount::getScalable(0)) {}
- FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
- *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
- }
- FixedScalableVFPair(const ElementCount &FixedVF,
- const ElementCount &ScalableVF)
- : FixedVF(FixedVF), ScalableVF(ScalableVF) {
- assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
- "Invalid scalable properties");
- }
-
- static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
-
- /// \return true if either fixed- or scalable VF is non-zero.
- explicit operator bool() const { return FixedVF || ScalableVF; }
-
- /// \return true if either fixed- or scalable VF is a valid vector VF.
- bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
-};
-
/// Planner drives the vectorization process after having passed
/// Legality checks.
class LoopVectorizationPlanner {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 640a98c622f80..60fec588fafc9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -54,6 +54,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "LoopVectorizationCostModel.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
@@ -288,7 +289,7 @@ cl::opt<unsigned> llvm::ForceTargetInstructionCost(
"an instruction to a single constant value. Mostly "
"useful for getting consistent testing."));
-static cl::opt<bool> ForceTargetSupportsScalableVectors(
+cl::opt<bool> llvm::ForceTargetSupportsScalableVectors(
"force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
cl::desc(
"Pretend that scalable vectors are supported, even if the target does "
@@ -340,7 +341,7 @@ static cl::opt<bool> ForceOrderedReductions(
cl::desc("Enable the vectorisation of loops with in-order (strict) "
"FP reductions"));
-static cl::opt<bool> PreferPredicatedReductionSelect(
+cl::opt<bool> llvm::PreferPredicatedReductionSelect(
"prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
@@ -378,7 +379,7 @@ cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
-static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
+cl::opt<cl::boolOrDefault> llvm::ForceSafeDivisor(
"force-widen-divrem-via-safe-divisor", cl::Hidden,
cl::desc(
"Override cost based safe divisor widening for div/rem instructions"));
@@ -853,894 +854,6 @@ static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
} // end namespace llvm
-namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
- // The default: allowing scalar epilogues.
- CM_ScalarEpilogueAllowed,
-
- // Vectorization with OptForSize: don't allow epilogues.
- CM_ScalarEpilogueNotAllowedOptSize,
-
- // A special case of vectorisation with OptForSize: loops with a very small
- // trip count are considered for vectorization under OptForSize, thereby
- // making sure the cost of their loop body is dominant, free of runtime
- // guards and scalar iteration overheads.
- CM_ScalarEpilogueNotAllowedLowTripLoop,
-
- // Loop hint predicate indicating an epilogue is undesired.
- CM_ScalarEpilogueNotNeededUsePredicate,
-
- // Directive indicating we must either tail fold or not vectorize
- CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
-/// LoopVectorizationCostModel - estimates the expected speedups due to
-/// vectorization.
-/// In many cases vectorization is not profitable. This can happen because of
-/// a number of reasons. In this class we mainly attempt to predict the
-/// expected speedup/slowdowns due to the supported instruction set. We use the
-/// TargetTransformInfo to query the different backends for the cost of
-/// different operations.
-class LoopVectorizationCostModel {
- friend class LoopVectorizationPlanner;
-
-public:
- LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
- const TargetTransformInfo &TTI,
- const TargetLibraryInfo *TLI, DemandedBits *DB,
- AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints,
- InterleavedAccessInfo &IAI,
- ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
- TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {
- if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
- initializeVScaleForTuning();
- CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
- // Query this against the original loop and save it here because the profile
- // of the original loop header may change as the transformation happens.
- OptForSize = llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
- PGSOQueryType::IRPass);
- }
-
- /// \return An upper bound for the vectorization factors (both fixed and
- /// scalable). If the factors are 0, vectorization and interleaving should be
- /// avoided up front.
- FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
-
- /// \return True if runtime checks are required for vectorization, and false
- /// otherwise.
- bool runtimeChecksRequired();
-
- /// Setup cost-based decisions for user vectorization factor.
- /// \return true if the UserVF is a feasible VF to be chosen.
- bool selectUserVectorizationFactor(ElementCount UserVF) {
- collectNonVectorizedAndSetWideningDecisions(UserVF);
- return expectedCost(UserVF).isValid();
- }
-
- /// \return True if maximizing vector bandwidth is enabled by the target or
- /// user options, for the given register kind.
- bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
-
- /// \return True if register pressure should be considered for the given VF.
- bool shouldConsiderRegPressureForVF(ElementCount VF);
-
- /// \return The size (in bits) of the smallest and widest types in the code
- /// that needs to be vectorized. We ignore values that remain scalar such as
- /// 64 bit loop indices.
- std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
-
- /// Memory access instruction may be vectorized in more than one way.
- /// Form of instruction after vectorization depends on cost.
- /// This function takes cost-based decisions for Load/Store instructions
- /// and collects them in a map. This decisions map is used for building
- /// the lists of loop-uniform and loop-scalar instructions.
- /// The calculated cost is saved with widening decision in order to
- /// avoid redundant calculations.
- void setCostBasedWideningDecision(ElementCount VF);
-
- /// A call may be vectorized in different ways depending on whether we have
- /// vectorized variants available and whether the target supports masking.
- /// This function analyzes all calls in the function at the supplied VF,
- /// makes a decision based on the costs of available options, and stores that
- /// decision in a map for use in planning and plan execution.
- void setVectorizedCallDecision(ElementCount VF);
-
- /// Collect values we want to ignore in the cost model.
- void collectValuesToIgnore();
-
- /// Collect all element types in the loop for which widening is needed.
- void collectElementTypesForWidening();
-
- /// Split reductions into those that happen in the loop, and those that happen
- /// outside. In loop reductions are collected into InLoopReductions.
- void collectInLoopReductions();
-
- /// Returns true if we should use strict in-order reductions for the given
- /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
- /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
- /// of FP operations.
- bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
- return !Hints->allowReordering() && RdxDesc.isOrdered();
- }
-
- /// \returns The smallest bitwidth each instruction can be represented with.
- /// The vector equivalents of these instructions should be truncated to this
- /// type.
- const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
- return MinBWs;
- }
-
- /// \returns True if it is more profitable to scalarize instruction \p I for
- /// vectorization factor \p VF.
- bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
- assert(VF.isVector() &&
- "Profitable to scalarize relevant only for VF > 1.");
- assert(
- TheLoop->isInnermost() &&
- "cost-model should not be used for outer loops (in VPlan-native path)");
-
- auto Scalars = InstsToScalarize.find(VF);
- assert(Scalars != InstsToScalarize.end() &&
- "VF not yet analyzed for scalarization profitability");
- return Scalars->second.contains(I);
- }
-
- /// Returns true if \p I is known to be uniform after vectorization.
- bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
- assert(
- TheLoop->isInnermost() &&
- "cost-model should not be used for outer loops (in VPlan-native path)");
- // Pseudo probe needs to be duplicated for each unrolled iteration and
- // vector lane so that profiled loop trip count can be accurately
- // accumulated instead of being under counted.
- if (isa<PseudoProbeInst>(I))
- return false;
-
- if (VF.isScalar())
- return true;
-
- auto UniformsPerVF = Uniforms.find(VF);
- assert(UniformsPerVF != Uniforms.end() &&
- "VF not yet analyzed for uniformity");
- return UniformsPerVF->second.count(I);
- }
-
- /// Returns true if \p I is known to be scalar after vectorization.
- bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
- assert(
- TheLoop->isInnermost() &&
- "cost-model should not be used for outer loops (in VPlan-native path)");
- if (VF.isScalar())
- return true;
-
- auto ScalarsPerVF = Scalars.find(VF);
- assert(ScalarsPerVF != Scalars.end() &&
- "Scalar values are not calculated for VF");
- return ScalarsPerVF->second.count(I);
- }
-
- /// \returns True if instruction \p I can be truncated to a smaller bitwidth
- /// for vectorization factor \p VF.
- bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
- return VF.isVector() && MinBWs.contains(I) &&
- !isProfitableToScalarize(I, VF) &&
- !isScalarAfterVectorization(I, VF);
- }
-
- /// Decision that was taken during cost calculation for memory instruction.
- enum InstWidening {
- CM_Unknown,
- CM_Widen, // For consecutive accesses with stride +1.
- CM_Widen_Reverse, // For consecutive accesses with stride -1.
- CM_Interleave,
- CM_GatherScatter,
- CM_Scalarize,
- CM_VectorCall,
- CM_IntrinsicCall
- };
-
- /// Save vectorization decision \p W and \p Cost taken by the cost model for
- /// instruction \p I and vector width \p VF.
- void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
- InstructionCost Cost) {
- assert(VF.isVector() && "Expected VF >=2");
- WideningDecisions[{I, VF}] = {W, Cost};
- }
-
- /// Save vectorization decision \p W and \p Cost taken by the cost model for
- /// interleaving group \p Grp and vector width \p VF.
- void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
- ElementCount VF, InstWidening W,
- InstructionCost Cost) {
- assert(VF.isVector() && "Expected VF >=2");
- /// Broadcast this decicion to all instructions inside the group.
- /// When interleaving, the cost will only be assigned one instruction, the
- /// insert position. For other cases, add the appropriate fraction of the
- /// total cost to each instruction. This ensures accurate costs are used,
- /// even if the insert position instruction is not used.
- InstructionCost InsertPosCost = Cost;
- InstructionCost OtherMemberCost = 0;
- if (W != CM_Interleave)
- OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
- ;
- for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
- if (auto *I = Grp->getMember(Idx)) {
- if (Grp->getInsertPos() == I)
- WideningDecisions[{I, VF}] = {W, InsertPosCost};
- else
- WideningDecisions[{I, VF}] = {W, OtherMemberCost};
- }
- }
- }
-
- /// Return the cost model decision for the given instruction \p I and vector
- /// width \p VF. Return CM_Unknown if this instruction did not pass
- /// through the cost modeling.
- InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
- assert(VF.isVector() && "Expected VF to be a vector VF");
- assert(
- TheLoop->isInnermost() &&
- "cost-model should not be used for outer loops (in VPlan-native path)");
-
- std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
- auto Itr = WideningDecisions.find(InstOnVF);
- if (Itr == WideningDecisions.end())
- return CM_Unknown;
- return Itr->second.first;
- }
-
- /// Return the vectorization cost for the given instruction \p I and vector
- /// width \p VF.
- InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
- assert(VF.isVector() && "Expected VF >=2");
- std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
- assert(WideningDecisions.contains(InstOnVF) &&
- "The cost is not calculated");
- return WideningDecisions[InstOnVF].second;
- }
-
- struct CallWideningDecision {
- InstWidening Kind;
- Function *Variant;
- Intrinsic::ID IID;
- std::optional<unsigned> MaskPos;
- InstructionCost Cost;
- };
-
- void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
- Function *Variant, Intrinsic::ID IID,
- std::optional<unsigned> MaskPos,
- InstructionCost Cost) {
- assert(!VF.isScalar() && "Expected vector VF");
- CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
- }
-
- CallWideningDecision getCallWideningDecision(CallInst *CI,
- ElementCount VF) const {
- assert(!VF.isScalar() && "Expected vector VF");
- auto I = CallWideningDecisions.find({CI, VF});
- if (I == CallWideningDecisions.end())
- return {CM_Unknown, nullptr, Intrinsic::not_intrinsic, std::nullopt, 0};
- return I->second;
- }
-
- /// Return True if instruction \p I is an optimizable truncate whose operand
- /// is an induction variable. Such a truncate will be removed by adding a new
- /// induction variable with the destination type.
- bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
- // If the instruction is not a truncate, return false.
- auto *Trunc = dyn_cast<TruncInst>(I);
- if (!Trunc)
- return false;
-
- // Get the source and destination types of the truncate.
- Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
- Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
-
- // If the truncate is free for the given types, return false. Replacing a
- // free truncate with an induction variable would add an induction variable
- // update instruction to each iteration of the loop. We exclude from this
- // check the primary induction variable since it will need an update
- // instruction regardless.
- Value *Op = Trunc->getOperand(0);
- if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
- return false;
-
- // If the truncated value is not an induction variable, return false.
- return Legal->isInductionPhi(Op);
- }
-
- /// Collects the instructions to scalarize for each predicated instruction in
- /// the loop.
- void collectInstsToScalarize(ElementCount VF);
-
- /// Collect values that will not be widened, including Uniforms, Scalars, and
- /// Instructions to Scalarize for the given \p VF.
- /// The sets depend on CM decision for Load/Store instructions
- /// that may be vectorized as interleave, gather-scatter or scalarized.
- /// Also make a decision on what to do about call instructions in the loop
- /// at that VF -- scalarize, call a known vector routine, or call a
- /// vector intrinsic.
- void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
- // Do the analysis once.
- if (VF.isScalar() || Uniforms.contains(VF))
- return;
- setCostBasedWideningDecision(VF);
- collectLoopUniforms(VF);
- setVectorizedCallDecision(VF);
- collectLoopScalars(VF);
- collectInstsToScalarize(VF);
- }
-
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
- unsigned AddressSpace) const {
- return Legal->isConsecutivePtr(DataType, Ptr) &&
- TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
- unsigned AddressSpace) const {
- return Legal->isConsecutivePtr(DataType, Ptr) &&
- TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
- bool LI = isa<LoadInst>(V);
- bool SI = isa<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ty = getLoadStoreType(V);
- Align Align = getLoadStoreAlignment(V);
- if (VF.isVector())
- Ty = VectorType::get(Ty, VF);
- return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
- (SI && TTI.isLegalMaskedScatter(Ty, Align));
- }
-
- /// Returns true if the target machine supports all of the reduction
- /// variables found for the given VF.
- bool canVectorizeReductions(ElementCount VF) const {
- return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
- const RecurrenceDescriptor &RdxDesc = Reduction.second;
- return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
- }));
- }
-
- /// Given costs for both strategies, return true if the scalar predication
- /// lowering should be used for div/rem. This incorporates an override
- /// option so it is not simply a cost comparison.
- bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
- InstructionCost SafeDivisorCost) const {
- switch (ForceSafeDivisor) {
- case cl::BOU_UNSET:
- return ScalarCost < SafeDivisorCost;
- case cl::BOU_TRUE:
- return false;
- case cl::BOU_FALSE:
- return true;
- }
- llvm_unreachable("impossible case value");
- }
-
- /// Returns true if \p I is an instruction which requires predication and
- /// for which our chosen predication strategy is scalarization (i.e. we
- /// don't have an alternate strategy such as masking available).
- /// \p VF is the vectorization factor that will be used to vectorize \p I.
- bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
-
- /// Returns true if \p I is an instruction that needs to be predicated
- /// at runtime. The result is independent of the predication mechanism.
- /// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I) const;
-
- /// Return the costs for our two available strategies for lowering a
- /// div/rem operation which requires speculating at least one lane.
- /// First result is for scalarization (will be invalid for scalable
- /// vectors); second is for the safe-divisor strategy.
- std::pair<InstructionCost, InstructionCost>
- getDivRemSpeculationCost(Instruction *I,
- ElementCount VF) const;
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
- bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
-
- /// Returns true if \p I is a memory instruction in an interleaved-group
- /// of memory accesses that can be vectorized with wide vector loads/stores
- /// and shuffles.
- bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
-
- /// Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) const {
- return InterleaveInfo.isInterleaved(Instr);
- }
-
- /// Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup<Instruction> *
- getInterleavedAccessGroup(Instruction *Instr) const {
- return InterleaveInfo.getInterleaveGroup(Instr);
- }
-
- /// Returns true if we're required to use a scalar epilogue for at least
- /// the final iteration of the original loop.
- bool requiresScalarEpilogue(bool IsVectorizing) const {
- if (!isScalarEpilogueAllowed()) {
- LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
- return false;
- }
- // If we might exit from anywhere but the latch and early exit vectorization
- // is disabled, we must run the exiting iteration in scalar form.
- if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
- !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
- LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
- "from latch block\n");
- return true;
- }
- if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
- LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
- "interleaved group requires scalar epilogue\n");
- return true;
- }
- LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
- return false;
- }
-
- /// Returns true if a scalar epilogue is not allowed due to optsize or a
- /// loop hint annotation.
- bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
- }
-
- /// Returns the TailFoldingStyle that is best for the current loop.
- TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
- if (!ChosenTailFoldingStyle)
- return TailFoldingStyle::None;
- return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
- : ChosenTailFoldingStyle->second;
- }
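
Aside: the role of the IVUpdateMayOverflow flag in the pair is easy to misread; a standalone sketch of the lookup (hypothetical names mirroring the accessor above, not part of the patch):

  #include <optional>
  #include <utility>

  enum class Style { None, Data, DataWithEVL };

  // First element: style when the IV update may overflow; second element:
  // style when it provably cannot. An unset pair means no tail folding.
  static Style lookupStyle(const std::optional<std::pair<Style, Style>> &Chosen,
                           bool IVUpdateMayOverflow) {
    if (!Chosen)
      return Style::None;
    return IVUpdateMayOverflow ? Chosen->first : Chosen->second;
  }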
-
- /// Selects and saves the TailFoldingStyle for two cases: whether the IV
- /// update may overflow or not.
- /// \param IsScalableVF true if scalable vector factors are enabled.
- /// \param UserIC User-specified interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
- assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
- if (!Legal->canFoldTailByMasking()) {
- ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
- return;
- }
-
- // Default to TTI preference, but allow command line override.
- ChosenTailFoldingStyle = {
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
- TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
- if (ForceTailFoldingStyle.getNumOccurrences())
- ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
- ForceTailFoldingStyle.getValue()};
-
- if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
- ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
- return;
- // Override EVL styles if needed.
- // FIXME: Investigate opportunity for fixed vector factor.
- bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
- TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
- if (EVLIsLegal)
- return;
- // If for some reason EVL mode is unsupported, fall back to a scalar epilogue
- // if it's allowed, or DataWithoutLaneMask otherwise.
- if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
- ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
- ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
- else
- ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
- TailFoldingStyle::DataWithoutLaneMask};
-
- LLVM_DEBUG(
- dbgs() << "LV: Preference for VP intrinsics indicated. Will "
- "not try to generate VP Intrinsics "
- << (UserIC > 1
- ? "since interleave count specified is greater than 1.\n"
- : "due to non-interleaving reasons.\n"));
- }
-
- /// Returns true if all loop blocks should be masked to fold the tail loop.
- bool foldTailByMasking() const {
- // TODO: check if it is possible to check for None style independent of
- // IVUpdateMayOverflow flag in getTailFoldingStyle.
- return getTailFoldingStyle() != TailFoldingStyle::None;
- }
-
- /// Return the maximum safe number of elements to be processed per vector
- /// iteration; processing at most this many elements neither prevents
- /// store-load forwarding nor violates the memory dependencies. Required for
- /// EVL-based VPlans to correctly calculate AVL (application vector length) as
- /// min(remaining AVL, MaxSafeElements).
- /// TODO: need to consider adjusting cost model to use this value as a
- /// vectorization factor for EVL-based vectorization.
- std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
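
Aside: a hedged illustration of the clamp described above (all names hypothetical, not part of the patch): an EVL-based loop derives its per-iteration application vector length from the remaining trip count, bounded by MaxSafeElements when one is recorded.

  #include <algorithm>
  #include <cstdint>
  #include <optional>

  // Per-iteration AVL: the elements still to be processed, clamped by the
  // dependence-imposed safe bound when present.
  static uint64_t computeAVL(uint64_t TripCount, uint64_t Processed,
                             std::optional<uint64_t> MaxSafeElements) {
    uint64_t Remaining = TripCount - Processed;
    return MaxSafeElements ? std::min(Remaining, *MaxSafeElements) : Remaining;
  }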
-
- /// Returns true if the instructions in this block require predication
- /// for any reason, e.g. because tail folding now requires a predicate
- /// or because the block in the original loop was predicated.
- bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
- return foldTailByMasking() || Legal->blockNeedsPredication(BB);
- }
-
- /// Returns true if VP intrinsics with explicit vector length support should
- /// be generated in the tail folded loop.
- bool foldTailWithEVL() const {
- return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
- }
-
- /// Returns true if the Phi is part of an inloop reduction.
- bool isInLoopReduction(PHINode *Phi) const {
- return InLoopReductions.contains(Phi);
- }
-
- /// Returns true if the predicated reduction select should be used to set the
- /// incoming value for the reduction phi.
- bool usePredicatedReductionSelect() const {
- // Force to use predicated reduction select since the EVL of the
- // second-to-last iteration might not be VF*UF.
- if (foldTailWithEVL())
- return true;
- return PreferPredicatedReductionSelect ||
- TTI.preferPredicatedReductionSelect();
- }
-
- /// Estimate cost of an intrinsic call instruction CI if it were vectorized
- /// with factor VF. Return the cost of the instruction, including
- /// scalarization overhead if it's needed.
- InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
-
- /// Estimate cost of a call instruction CI if it were vectorized with factor
- /// VF. Return the cost of the instruction, including scalarization overhead
- /// if it's needed.
- InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
-
- /// Invalidates decisions already taken by the cost model.
- void invalidateCostModelingDecisions() {
- WideningDecisions.clear();
- CallWideningDecisions.clear();
- Uniforms.clear();
- Scalars.clear();
- }
-
- /// Returns the expected execution cost. The unit of the cost does
- /// not matter because we use the 'cost' units to compare different
- /// vector widths. The cost that is returned is *not* normalized by
- /// the factor width.
- InstructionCost expectedCost(ElementCount VF);
-
- bool hasPredStores() const { return NumPredStores > 0; }
-
- /// Returns true if epilogue vectorization is considered profitable, and
- /// false otherwise.
- /// \p VF is the vectorization factor chosen for the original loop.
- /// \p IC is the interleave count, an additional scaling factor applied to
- /// VF before comparing to EpilogueVectorizationMinVF.
- bool isEpilogueVectorizationProfitable(const ElementCount VF,
- const unsigned IC) const;
-
- /// Returns the execution time cost of an instruction for a given vector
- /// width. Vector width of one means scalar.
- InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
-
- /// Return the cost of instructions in an inloop reduction pattern, if I is
- /// part of that pattern.
- std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
- ElementCount VF,
- Type *VectorTy) const;
-
- /// Returns true if \p Op should be considered invariant and if it is
- /// trivially hoistable.
- bool shouldConsiderInvariant(Value *Op);
-
- /// Return the value of vscale used for tuning the cost model.
- std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
-
-private:
- unsigned NumPredStores = 0;
-
- /// Used to store the value of vscale used for tuning the cost model. It is
- /// initialized during object construction.
- std::optional<unsigned> VScaleForTuning;
-
- /// Initializes the value of vscale used for tuning the cost model. If
- /// vscale_range.min == vscale_range.max, then use vscale_range.max; otherwise
- /// use the value returned by the corresponding TTI method.
- void initializeVScaleForTuning() {
- const Function *Fn = TheLoop->getHeader()->getParent();
- if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
- auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
- auto Min = Attr.getVScaleRangeMin();
- auto Max = Attr.getVScaleRangeMax();
- if (Max && Min == Max) {
- VScaleForTuning = Max;
- return;
- }
- }
-
- VScaleForTuning = TTI.getVScaleForTuning();
- }
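
Aside: the collapse rule above, restated as a self-contained sketch (hypothetical helper, not part of the patch): a degenerate vscale_range whose min and max agree pins the tuning value exactly; anything else defers to the target.

  #include <optional>
  #include <utility>

  // A function-level vscale_range(Min, Max) with Min == Max (and nonzero)
  // gives an exact tuning value; otherwise fall back to the target's
  // preference, which may itself be unset.
  static std::optional<unsigned>
  pickVScaleForTuning(std::optional<std::pair<unsigned, unsigned>> Range,
                      std::optional<unsigned> TargetPreference) {
    if (Range && Range->second && Range->first == Range->second)
      return Range->second;
    return TargetPreference;
  }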
-
- /// \return An upper bound for the vectorization factors for both
- /// fixed and scalable vectorization, where the minimum-known number of
- /// elements is a power-of-2 larger than zero. If scalable vectorization is
- /// disabled or unsupported, then the scalable part will be equal to
- /// ElementCount::getScalable(0).
- FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
- ElementCount UserVF,
- bool FoldTailByMasking);
-
- /// If \p VF > MaxTripCount, clamps it to the next lower VF that is <=
- /// MaxTripCount.
- ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
- bool FoldTailByMasking) const;
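
Aside: for fixed VFs the clamping reduces to stepping down through powers of two; a simplified sketch (fixed-width only, not part of the patch; the real routine also handles scalable VFs and tail folding):

  #include <cstdint>

  // Halve a power-of-two VF until it fits under MaxTripCount; returns 0
  // when even VF=1 does not fit (MaxTripCount == 0). E.g. VF=8 with a
  // trip count of 6 clamps to 4.
  static uint64_t clampFixedVF(uint64_t VF, uint64_t MaxTripCount) {
    while (VF > MaxTripCount)
      VF /= 2;
    return VF;
  }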
-
- /// \return the maximized element count based on the target's vector
- /// registers and the loop trip-count, but limited to a maximum safe VF.
- /// This is a helper function of computeFeasibleMaxVF.
- ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
- unsigned SmallestType,
- unsigned WidestType,
- ElementCount MaxSafeVF,
- bool FoldTailByMasking);
-
- /// Checks if scalable vectorization is supported and enabled. Caches the
- /// result to avoid repeated debug dumps for repeated queries.
- bool isScalableVectorizationAllowed();
-
- /// \return the maximum legal scalable VF, based on the safe max number
- /// of elements.
- ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
-
- /// Calculate vectorization cost of memory instruction \p I.
- InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for a scalarized memory instruction.
- InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for an interleave group of memory instructions.
- InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for a Gather/Scatter instruction.
- InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for widening instruction \p I with consecutive
- /// memory access.
- InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
-
- /// The cost calculation for a Load/Store instruction \p I with a uniform
- /// pointer:
- /// Load: scalar load + broadcast.
- /// Store: scalar store + (loop invariant value stored? 0 : extract of last
- /// element)
- InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
-
- /// Estimate the overhead of scalarizing an instruction. This is a
- /// convenience wrapper for the type-based getScalarizationOverhead API.
- InstructionCost getScalarizationOverhead(Instruction *I,
- ElementCount VF) const;
-
- /// Returns true if an artificially high cost for emulated masked memrefs
- /// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
-
- /// Map of scalar integer values to the smallest bitwidth they can be legally
- /// represented as. The vector equivalents of these values should be truncated
- /// to this type.
- MapVector<Instruction *, uint64_t> MinBWs;
-
- /// A type representing the costs for instructions if they were to be
- /// scalarized rather than vectorized. The entries are Instruction-Cost
- /// pairs.
- using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
-
- /// A set containing all BasicBlocks that are known to be present after
- /// vectorization as predicated blocks.
- DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
- PredicatedBBsAfterVectorization;
-
- /// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
- /// aliasing/dependence checks fail, or to handle the tail/remainder
- /// iterations when the trip count is unknown or not a multiple of the VF,
- /// or as a peel-loop to handle gaps in interleave-groups.
- /// Under optsize and when the trip count is very small we don't allow any
- /// iterations to execute in the scalar loop.
- ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
- /// Holds the finally chosen tail-folding styles. The first element is used
- /// if the IV update may overflow; the second, if it does not.
- std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
- ChosenTailFoldingStyle;
-
- /// true if scalable vectorization is supported and enabled.
- std::optional<bool> IsScalableVectorizationAllowed;
-
- /// Maximum safe number of elements to be processed per vector iteration,
- /// which do not prevent store-load forwarding and are safe with regard to the
- /// memory dependencies. Required for EVL-based vectorization, where this
- /// value is used as the upper bound of the safe AVL.
- std::optional<unsigned> MaxSafeElements;
-
- /// A map holding scalar costs for different vectorization factors. The
- /// presence of a cost for an instruction in the mapping indicates that the
- /// instruction will be scalarized when vectorizing with the associated
- /// vectorization factor. The entries are VF-ScalarCostTy pairs.
- MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
-
- /// Holds the instructions known to be uniform after vectorization.
- /// The data is collected per VF.
- DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
-
- /// Holds the instructions known to be scalar after vectorization.
- /// The data is collected per VF.
- DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
-
- /// Holds the instructions (address computations) that are forced to be
- /// scalarized.
- DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
-
- /// PHINodes of the reductions that should be expanded in-loop.
- SmallPtrSet<PHINode *, 4> InLoopReductions;
-
- /// A map of inloop reduction operations and their immediate chain operand.
- /// FIXME: This can be removed once reductions can be costed correctly in
- /// VPlan. This was added to allow quick lookup of the inloop operations.
- DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
-
- /// Returns the expected difference in cost from scalarizing the expression
- /// feeding a predicated instruction \p PredInst. The instructions to
- /// scalarize and their scalar costs are collected in \p ScalarCosts. A
- /// non-negative return value implies the expression will be scalarized.
- /// Currently, only single-use chains are considered for scalarization.
- InstructionCost computePredInstDiscount(Instruction *PredInst,
- ScalarCostsTy &ScalarCosts,
- ElementCount VF);
-
- /// Collect the instructions that are uniform after vectorization. An
- /// instruction is uniform if we represent it with a single scalar value in
- /// the vectorized loop corresponding to each vector iteration. Examples of
- /// uniform instructions include pointer operands of consecutive or
- /// interleaved memory accesses. Note that although uniformity implies an
- /// instruction will be scalar, the reverse is not true. In general, a
- /// scalarized instruction will be represented by VF scalar values in the
- /// vectorized loop, each corresponding to an iteration of the original
- /// scalar loop.
- void collectLoopUniforms(ElementCount VF);
-
- /// Collect the instructions that are scalar after vectorization. An
- /// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. collectLoopScalars should only add non-uniform nodes
- /// to the list if they are used by a load/store instruction that is marked as
- /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
- /// VF values in the vectorized loop, each corresponding to an iteration of
- /// the original scalar loop.
- void collectLoopScalars(ElementCount VF);
-
- /// Keeps cost model vectorization decisions and costs for instructions.
- /// Right now it is used for memory instructions only.
- using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
- std::pair<InstWidening, InstructionCost>>;
-
- DecisionList WideningDecisions;
-
- using CallDecisionList =
- DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
-
- CallDecisionList CallWideningDecisions;
-
- /// Returns true if \p V is expected to be vectorized and it needs to be
- /// extracted.
- bool needsExtract(Value *V, ElementCount VF) const {
- Instruction *I = dyn_cast<Instruction>(V);
- if (VF.isScalar() || !I || !TheLoop->contains(I) ||
- TheLoop->isLoopInvariant(I) ||
- getWideningDecision(I, VF) == CM_Scalarize ||
- (isa<CallInst>(I) &&
- getCallWideningDecision(cast<CallInst>(I), VF).Kind == CM_Scalarize))
- return false;
-
- // Assume we can vectorize V (and hence we need extraction) if the
- // scalars are not computed yet. This can happen, because it is called
- // via getScalarizationOverhead from setCostBasedWideningDecision, before
- // the scalars are collected. That should be a safe assumption in most
- // cases, because we check if the operands have vectorizable types
- // beforehand in LoopVectorizationLegality.
- return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF);
- }
-
- /// Returns a range containing only operands needing to be extracted.
- SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
- ElementCount VF) const {
- SmallPtrSet<const Value *, 4> UniqueOperands;
- SmallVector<Value *, 4> Res;
- for (Value *Op : Ops) {
- if (isa<Constant>(Op) || !UniqueOperands.insert(Op).second ||
- !needsExtract(Op, VF))
- continue;
- Res.push_back(Op);
- }
- return Res;
- }
-
-public:
- /// The loop that we evaluate.
- Loop *TheLoop;
-
- /// Predicated scalar evolution analysis.
- PredicatedScalarEvolution &PSE;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Vectorization legality.
- LoopVectorizationLegality *Legal;
-
- /// Vector target information.
- const TargetTransformInfo &TTI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Demanded bits analysis.
- DemandedBits *DB;
-
- /// Assumption cache.
- AssumptionCache *AC;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- const Function *TheFunction;
-
- /// Loop Vectorize Hint.
- const LoopVectorizeHints *Hints;
-
- /// The interleave access information contains groups of interleaved accesses
- /// with the same stride that are close to each other.
- InterleavedAccessInfo &InterleaveInfo;
-
- /// Values to ignore in the cost model.
- SmallPtrSet<const Value *, 16> ValuesToIgnore;
-
- /// Values to ignore in the cost model when VF > 1.
- SmallPtrSet<const Value *, 16> VecValuesToIgnore;
-
- /// All element types found in the loop.
- SmallPtrSet<Type *, 16> ElementTypesInLoop;
-
- /// The kind of cost that we are calculating.
- TTI::TargetCostKind CostKind;
-
- /// Whether this loop should be optimized for size based on function attribute
- /// or profile information.
- bool OptForSize;
-
- /// The highest VF possible for this loop, without using MaxBandwidth.
- FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
-};
-} // end namespace llvm
-
namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
@@ -2490,6 +1603,119 @@ static unsigned estimateElementCount(ElementCount VF,
return EstimatedVF;
}
+bool LoopVectorizationCostModel::isLegalMaskedStore(
+ Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const {
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
+ TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
+}
+
+bool LoopVectorizationCostModel::isLegalMaskedLoad(
+ Type *DataType, Value *Ptr, Align Alignment, unsigned AddressSpace) const {
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
+ TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
+}
+
+bool LoopVectorizationCostModel::canVectorizeReductions(ElementCount VF) const {
+ return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+ const RecurrenceDescriptor &RdxDesc = Reduction.second;
+ return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
+ }));
+}
+
+bool LoopVectorizationCostModel::blockNeedsPredicationForAnyReason(
+ BasicBlock *BB) const {
+ return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+}
+
+bool LoopVectorizationCostModel::isOptimizableIVTruncate(Instruction *I,
+ ElementCount VF) {
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = toVectorTy(Trunc->getSrcTy(), VF);
+ Type *DestTy = toVectorTy(Trunc->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionPhi(Op);
+}
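
Aside: a hedged source-level example of the pattern this targets (hypothetical, not from the patch): a 64-bit induction variable truncated to 32 bits on every iteration; the truncate can be replaced by a parallel 32-bit induction variable.

  #include <cstdint>

  // The (int32_t)I truncate of the primary IV recurs each iteration; the
  // vectorizer can materialize a separate 32-bit induction variable and
  // drop the truncate from the vector body.
  static void storeIota(int32_t *A, int64_t N) {
    for (int64_t I = 0; I < N; ++I)
      A[I] = static_cast<int32_t>(I);
  }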
+
+bool LoopVectorizationCostModel::requiresScalarEpilogue(
+ bool IsVectorizing) const {
+ if (!isScalarEpilogueAllowed()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+ return false;
+ }
+ // If we might exit from anywhere but the latch and early exit vectorization
+ // is disabled, we must run the exiting iteration in scalar form.
+ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
+ !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
+ LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
+ "from latch block\n");
+ return true;
+ }
+ if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
+ LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
+ "interleaved group requires scalar epilogue\n");
+ return true;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
+ return false;
+}
+
+void LoopVectorizationCostModel::setTailFoldingStyles(bool IsScalableVF,
+ unsigned UserIC) {
+ assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
+ if (!Legal->canFoldTailByMasking()) {
+ ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
+ return;
+ }
+
+ // Default to TTI preference, but allow command line override.
+ ChosenTailFoldingStyle = {
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
+ TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
+ if (ForceTailFoldingStyle.getNumOccurrences())
+ ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
+ ForceTailFoldingStyle.getValue()};
+
+ if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
+ ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
+ return;
+ // Override EVL styles if needed.
+ // FIXME: Investigate opportunity for fixed vector factor.
+ bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
+ TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
+ if (EVLIsLegal)
+ return;
+ // If for some reason EVL mode is unsupported, fall back to a scalar epilogue
+ // if it's allowed, or DataWithoutLaneMask otherwise.
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+ ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
+ ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
+ else
+ ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
+ TailFoldingStyle::DataWithoutLaneMask};
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Preference for VP intrinsics indicated. Will "
+ "not try to generate VP Intrinsics "
+ << (UserIC > 1
+ ? "since interleave count specified is greater than 1.\n"
+ : "due to non-interleaving reasons.\n"));
+}
+
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
ElementCount VF) const {