[llvm] [LV] Use getFixedValue instead of getKnownMinValue when appropriate (PR #143526)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 12 05:46:54 PDT 2025
https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/143526
>From 3857530a491e9309cde5572a575d69311b939386 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 10 Jun 2025 12:52:29 +0000
Subject: [PATCH 1/3] [LV] Use getFixedValue instead of getKnownMinValue when
appropriate
There are many places in VPlan and LoopVectorize where we use
getKnownMinValue to obtain the number of elements in a vector.
Where we expect the vector to have a fixed length, I have used
the stronger getFixedValue call instead. I believe this is
clearer, and it adds extra protection in the form of the assert
in getFixedValue that the vector is not scalable.
While looking at VPFirstOrderRecurrencePHIRecipe::computeCost
I also took the liberty of simplifying the code.
In theory I believe this patch should be NFC, but I'm reluctant
to add that tag to the title in case we're simply missing test
coverage. I built and ran the LLVM test suite when targeting
neoverse-v1 and saw no problems.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 34 +++++++++++--------
llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 ++--
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 18 +++++-----
3 files changed, 30 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8177b76ad5bdf..f5324e85f0048 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3107,12 +3107,13 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
// that we will create. This cost is likely to be zero. The phi node
// cost, if any, should be scaled by the block probability because it
// models a copy at the end of each predicated block.
- ScalarizationCost += VF.getKnownMinValue() *
- TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ ScalarizationCost +=
+ VF.getFixedValue() * TTI.getCFInstrCost(Instruction::PHI, CostKind);
// The cost of the non-predicated instruction.
- ScalarizationCost += VF.getKnownMinValue() *
- TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
+ ScalarizationCost +=
+ VF.getFixedValue() *
+ TTI.getArithmeticInstrCost(I->getOpcode(), I->getType(), CostKind);
// The cost of insertelement and extractelement instructions needed for
// scalarization.
@@ -4280,7 +4281,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
return NumLegalParts <= VF.getKnownMinValue();
}
// Two or more elements that share a register - are vectorized.
- return NumLegalParts < VF.getKnownMinValue();
+ return NumLegalParts < VF.getFixedValue();
};
// If no def nor is a store, e.g., branches, continue - no value to check.
@@ -4565,8 +4566,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
assert(!isa<SCEVCouldNotCompute>(TC) &&
"Trip count SCEV must be computable");
RemainingIterations = SE.getURemExpr(
- TC, SE.getConstant(TCType, MainLoopVF.getKnownMinValue() * IC));
- MaxTripCount = MainLoopVF.getKnownMinValue() * IC - 1;
+ TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC));
+ MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations,
SE.getConstant(TCType, MaxTripCount))) {
MaxTripCount =
@@ -4577,7 +4578,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
}
if (SE.isKnownPredicate(
CmpInst::ICMP_UGT,
- SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
+ SE.getConstant(TCType, NextVF.Width.getFixedValue()),
RemainingIterations))
continue;
}
@@ -5248,14 +5249,14 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Get the cost of the scalar memory instruction and address computation.
InstructionCost Cost =
- VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+ VF.getFixedValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
const Align Alignment = getLoadStoreAlignment(I);
- Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
- ValTy->getScalarType(),
- Alignment, AS, CostKind);
+ Cost += VF.getFixedValue() * TTI.getMemoryOpCost(I->getOpcode(),
+ ValTy->getScalarType(),
+ Alignment, AS, CostKind);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5271,7 +5272,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
auto *VecI1Ty =
VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
Cost += TTI.getScalarizationOverhead(
- VecI1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
+ VecI1Ty, APInt::getAllOnes(VF.getFixedValue()),
/*Insert=*/false, /*Extract=*/true, CostKind);
Cost += TTI.getCFInstrCost(Instruction::Br, CostKind);
@@ -5317,7 +5318,6 @@ InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I, VF));
-
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
@@ -5332,6 +5332,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
StoreInst *SI = cast<StoreInst>(I);
bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
+ // TODO: We have tests that request the cost of extracting element
+ // VF.getKnownMinValue() - 1 from a scalable vector. This is actually
+ // meaningless, given what we actually want is the last lane and is likely
+ // to be more expensive.
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
CostKind) +
@@ -5614,7 +5618,7 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
for (Type *VectorTy : getContainedTypes(RetTy)) {
Cost += TTI.getScalarizationOverhead(
- cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getKnownMinValue()),
+ cast<VectorType>(VectorTy), APInt::getAllOnes(VF.getFixedValue()),
/*Insert=*/true,
/*Extract=*/false, CostKind);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1838562f26b82..b5fa98d341756 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -331,7 +331,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
bool IsSingleScalar = vputils::isSingleScalar(Def);
- VPLane LastLane(IsSingleScalar ? 0 : VF.getKnownMinValue() - 1);
+ VPLane LastLane(IsSingleScalar ? 0 : VF.getFixedValue() - 1);
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, LastLane)) {
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
@@ -368,7 +368,7 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
Value *Undef = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
set(Def, Undef);
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
+ for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)
packScalarIntoVectorizedValue(Def, Lane);
VectorValue = get(Def);
}
@@ -789,8 +789,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Entry);
State->Lane = VPLane(0);
- for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
- ++Lane) {
+ for (unsigned Lane = 0, VF = State->VF.getFixedValue(); Lane < VF; ++Lane) {
State->Lane = VPLane(Lane, VPLane::Kind::First);
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa6b13c217bd1..7894ccd9b4c5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -872,7 +872,7 @@ void VPInstruction::execute(VPTransformState &State) {
isVectorToScalar() || isSingleScalar());
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
if (GeneratesPerAllLanes) {
- for (unsigned Lane = 0, NumLanes = State.VF.getKnownMinValue();
+ for (unsigned Lane = 0, NumLanes = State.VF.getFixedValue();
Lane != NumLanes; ++Lane) {
Value *GeneratedValue = generatePerLane(State, VPLane(Lane));
assert(GeneratedValue && "generatePerLane must produce a value");
@@ -2792,8 +2792,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
}
// Generate scalar instances for all VF lanes.
- assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
- const unsigned EndLane = State.VF.getKnownMinValue();
+ const unsigned EndLane = State.VF.getFixedValue();
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
scalarizeInstruction(UI, this, VPLane(Lane), State);
}
@@ -2846,7 +2845,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
UI->getOpcode(), ResultTy, CostKind,
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
Op2Info, Operands, UI, &Ctx.TLI) *
- (isSingleScalar() ? 1 : VF.getKnownMinValue());
+ (isSingleScalar() ? 1 : VF.getFixedValue());
}
}
@@ -3399,7 +3398,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
Value *ResBlockInMask = State.get(BlockInMask);
Value *ShuffledMask = State.Builder.CreateShuffleVector(
ResBlockInMask,
- createReplicatedMask(InterleaveFactor, State.VF.getKnownMinValue()),
+ createReplicatedMask(InterleaveFactor, State.VF.getFixedValue()),
"interleaved.mask");
return MaskForGaps ? State.Builder.CreateBinOp(Instruction::And,
ShuffledMask, MaskForGaps)
@@ -3411,8 +3410,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
if (isa<LoadInst>(Instr)) {
Value *MaskForGaps = nullptr;
if (NeedsMaskForGaps) {
- MaskForGaps = createBitMaskForGaps(State.Builder,
- State.VF.getKnownMinValue(), *Group);
+ MaskForGaps =
+ createBitMaskForGaps(State.Builder, State.VF.getFixedValue(), *Group);
assert(MaskForGaps && "Mask for Gaps is required but it is null");
}
@@ -3475,13 +3474,12 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
continue;
auto StrideMask =
- createStrideMask(I, InterleaveFactor, State.VF.getKnownMinValue());
+ createStrideMask(I, InterleaveFactor, State.VF.getFixedValue());
Value *StridedVec =
State.Builder.CreateShuffleVector(NewLoad, StrideMask, "strided.vec");
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
- assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF);
StridedVec =
createBitOrPointerCast(State.Builder, StridedVec, OtherVTy, DL);
@@ -3817,7 +3815,7 @@ VPFirstOrderRecurrencePHIRecipe::computeCost(ElementCount VF,
if (VF.isScalar())
return Ctx.TTI.getCFInstrCost(Instruction::PHI, Ctx.CostKind);
- if (VF.isScalable() && VF.getKnownMinValue() == 1)
+ if (VF == ElementCount::getScalable(1))
return InstructionCost::getInvalid();
return 0;
>From 3957b6c62adfb8fa1333643a00eadfd0dfa503a4 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 11 Jun 2025 12:40:36 +0000
Subject: [PATCH 2/3] Address review comments
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 +
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f5324e85f0048..8f8a1ce9d7ad3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5332,10 +5332,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
StoreInst *SI = cast<StoreInst>(I);
bool IsLoopInvariantStoreValue = Legal->isInvariant(SI->getValueOperand());
- // TODO: We have tests that request the cost of extracting element
- // VF.getKnownMinValue() - 1 from a scalable vector. This is actually
- // meaningless, given what we actually want is the last lane and is likely
- // to be more expensive.
+ // TODO: We have existing tests that request the cost of extracting element
+ // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
+ // the actual generated code, which involves extracting the last element of
+ // a scalable vector where the lane to extract is unknown at compile time.
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
CostKind) +
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 7894ccd9b4c5a..c4c6141cb62c5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -3462,6 +3462,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
return;
}
+ assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
// For each member in the group, shuffle out the appropriate data from the
// wide loads.
>From 4050dc98377f9676895c821b83a3ce90c032a80f Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Thu, 12 Jun 2025 12:46:08 +0000
Subject: [PATCH 3/3] Re-add blank line
---
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8f8a1ce9d7ad3..86bea6afdb738 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5318,6 +5318,7 @@ InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I, VF));
+
Type *ValTy = getLoadStoreType(I);
auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
const Align Alignment = getLoadStoreAlignment(I);
More information about the llvm-commits
mailing list