[llvm] [LV] Add a flag to conservatively choose a larger vector factor when maximizing bandwidth (PR #156012)
Yuta Mukai via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 2 05:57:29 PDT 2025
https://github.com/ytmukai updated https://github.com/llvm/llvm-project/pull/156012
>From a63ac66f7d95d98a2dc445166a6129ba87c89125 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Mon, 25 Aug 2025 01:09:43 +0000
Subject: [PATCH 1/2] [LV] Add a flag to conservatively choose a larger vector
factor when maximizing bandwidth
Add a flag -vectorizer-maximize-bandwidth-conservatively to
conservatively choose a larger vector factor when considering
candidates up to the factor that matches the smallest type
size. When the vector factor is large, pack/unpack instructions for
vector registers may be required, which can lead to performance
degradation due to the vector calculation pipeline becoming a
bottleneck, even if the overall number of instructions is
reduced. When this flag is enabled, a larger factor is chosen
only if it is superior not only in terms of overall cost but also when
compared solely based on the cost of vector calculation.
---
.../llvm/Analysis/TargetTransformInfo.h | 7 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 5 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++
.../AArch64/AArch64TargetTransformInfo.cpp | 7 +-
.../Vectorize/LoopVectorizationPlanner.h | 3 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 77 +++++++++++++++----
llvm/lib/Transforms/Vectorize/VPlan.cpp | 54 +++++++++----
llvm/lib/Transforms/Vectorize/VPlan.h | 12 ++-
.../maximize-bandwidth-conservatively.ll | 58 ++++++++++++++
9 files changed, 195 insertions(+), 33 deletions(-)
create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index c4ba8e9857dc4..abf087281fe41 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1231,6 +1231,13 @@ class TargetTransformInfo {
LLVM_ABI bool
shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const;
+ /// \return True if vectorization factors wider than those matching the
+ /// largest element type should be chosen conservatively. This only makes
+ /// sense when shouldMaximizeVectorBandwidth returns true.
+ /// \p K Register Kind for vectorization.
+ LLVM_ABI bool shouldMaximizeVectorBandwidthConservatively(
+ TargetTransformInfo::RegisterKind K) const;
+
/// \return The minimum vectorization factor for types of given element
/// bit width, or 0 if there is no minimum VF. The returned value only
/// applies when shouldMaximizeVectorBandwidth returns true.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 43813d2f3acb5..6651505be9b86 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -597,6 +597,11 @@ class TargetTransformInfoImplBase {
return false;
}
+ virtual bool shouldMaximizeVectorBandwidthConservatively(
+ TargetTransformInfo::RegisterKind K) const {
+ return false;
+ }
+
virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const {
return ElementCount::get(0, IsScalable);
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 4ac8f03e6dbf5..0485581b8006c 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -803,6 +803,11 @@ bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
return TTIImpl->shouldMaximizeVectorBandwidth(K);
}
+bool TargetTransformInfo::shouldMaximizeVectorBandwidthConservatively(
+ TargetTransformInfo::RegisterKind K) const {
+ return TTIImpl->shouldMaximizeVectorBandwidthConservatively(K);
+}
+
ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth,
bool IsScalable) const {
return TTIImpl->getMinimumVF(ElemWidth, IsScalable);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 490f6391c15a0..ac75a8a1727e9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -76,6 +76,9 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));
+static cl::opt<bool> EnableSVEMaximizeVecBW("enable-sve-maximize-vec-bw",
+ cl::init(false), cl::Hidden);
+
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -370,7 +373,9 @@ bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
- ST->isNeonAvailable());
+ ST->isNeonAvailable()) ||
+ (EnableSVEMaximizeVecBW &&
+ K == TargetTransformInfo::RGK_ScalableVector && ST->isSVEAvailable());
}
/// Calculate the cost of materializing a 64-bit value. This helper
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 838476dcae661..c747920b0a318 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -474,7 +474,8 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
- InstructionCost cost(VPlan &Plan, ElementCount VF) const;
+ InstructionCost cost(VPlan &Plan, ElementCount VF,
+ bool CountsVecCalcOnly = false) const;
/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a0f306c12754f..a70c21353139d 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -263,6 +263,11 @@ static cl::opt<bool> MaximizeBandwidth(
cl::desc("Maximize bandwidth when selecting vectorization factor which "
"will be determined by the smallest type in loop."));
+static cl::opt<bool> MaximizeBandwidthConservatively(
+ "vectorizer-maximize-bandwidth-conservatively", cl::init(false), cl::Hidden,
cl::desc("When maximizing bandwidth, choose a larger vector factor only if "
"it is also more profitable when comparing solely the cost of "
"vector calculations."));
+
static cl::opt<bool> EnableInterleavedMemAccesses(
"enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
@@ -962,9 +967,16 @@ class LoopVectorizationCostModel {
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
+ /// \return True if maximizing vector bandwidth should be applied
+ /// conservatively by the target or user options, for the given register kind.
+ /// This only makes sense when useMaxBandwidth returns true.
+ bool useMaxBandwidthConservatively(TargetTransformInfo::RegisterKind RegKind);
+
/// \return True if register pressure should be calculated for the given VF.
bool shouldCalculateRegPressureForVF(ElementCount VF);
+ bool isVFForMaxBandwidth(ElementCount VF);
+
/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
@@ -3812,11 +3824,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
ElementCount VF) {
+ // Only calculate register pressure for VFs enabled by MaxBandwidth.
+ return isVFForMaxBandwidth(VF);
+}
+
+bool LoopVectorizationCostModel::isVFForMaxBandwidth(ElementCount VF) {
if (!useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector))
return false;
- // Only calculate register pressure for VFs enabled by MaxBandwidth.
return ElementCount::isKnownGT(
VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
: MaxPermissibleVFWithoutMaxBW.FixedVF);
@@ -3830,6 +3846,13 @@ bool LoopVectorizationCostModel::useMaxBandwidth(
Legal->hasVectorCallVariants())));
}
+bool LoopVectorizationCostModel::useMaxBandwidthConservatively(
+ TargetTransformInfo::RegisterKind RegKind) {
+ return MaximizeBandwidthConservatively ||
+ (MaximizeBandwidthConservatively.getNumOccurrences() == 0 &&
+ TTI.shouldMaximizeVectorBandwidthConservatively(RegKind));
+}
+
ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
ElementCount VF, unsigned MaxTripCount, bool FoldTailByMasking) const {
unsigned EstimatedVF = VF.getKnownMinValue();
@@ -6923,13 +6946,16 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}
-InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
- ElementCount VF) const {
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
+ bool CountsVecCalcOnly) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind);
- InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
+ InstructionCost Cost;
+
+ if (!CountsVecCalcOnly)
+ Cost += precomputeCosts(Plan, VF, CostCtx);
// Now compute and add the VPlan-based cost.
- Cost += Plan.cost(VF, CostCtx);
+ Cost += Plan.cost(VF, CostCtx, CountsVecCalcOnly);
#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
@@ -7105,8 +7131,25 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}
- if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
- BestFactor = CurrentFactor;
+ if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) {
+ if (CM.isVFForMaxBandwidth(VF) &&
+ CM.useMaxBandwidthConservatively(
+ VF.isScalable() ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector)) {
if (ElementCount::isKnownLT(BestFactor.Width, VF) &&
llvm::is_contained(VFs, BestFactor.Width)) {
+ VectorizationFactor BestFactorVecCalc(
+ BestFactor.Width, cost(*P, BestFactor.Width, true), ScalarCost);
+ VectorizationFactor CurrentFactorVecCalc(VF, cost(*P, VF, true),
+ ScalarCost);
+ if (isMoreProfitable(CurrentFactorVecCalc, BestFactorVecCalc,
+ P->hasScalarTail()))
+ BestFactor = CurrentFactor;
+ }
+ } else {
+ BestFactor = CurrentFactor;
+ }
+ }
// If profitable add it to ProfitableVF list.
if (isMoreProfitable(CurrentFactor, ScalarFactor, P->hasScalarTail()))
@@ -7131,13 +7174,19 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
// Verify that the VPlan-based and legacy cost models agree, except for VPlans
// with early exits and plans with additional VPlan simplifications. The
// legacy cost model doesn't properly model costs for such loops.
- assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
- planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
- CostCtx, OrigLoop,
- BestFactor.Width) ||
- planContainsAdditionalSimplifications(
- getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
- " VPlan cost model and legacy cost model disagreed");
+ if (!CM.isVFForMaxBandwidth(LegacyVF.Width) ||
+ !CM.useMaxBandwidthConservatively(
+ LegacyVF.Width.isScalable()
+ ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector))
+ assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
+ planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
+ CostCtx, OrigLoop,
+ BestFactor.Width) ||
+ planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
+ CostCtx, OrigLoop,
+ LegacyVF.Width)) &&
+ " VPlan cost model and legacy cost model disagreed");
assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
"when vectorizing, the scalar cost must be computed.");
#endif
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f972efa07eb7e..3470de8e56871 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -800,10 +800,34 @@ void VPRegionBlock::execute(VPTransformState *State) {
State->Lane.reset();
}
-InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx) {
+InstructionCost VPBasicBlock::cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly) {
InstructionCost Cost = 0;
- for (VPRecipeBase &R : Recipes)
- Cost += R.cost(VF, Ctx);
+ for (VPRecipeBase &R : Recipes) {
+ if (!CountsVecCalcOnly)
+ Cost += R.cost(VF, Ctx);
+ else {
+ switch (R.getVPDefID()) {
+ case VPDef::VPActiveLaneMaskPHISC:
+ case VPDef::VPBlendSC:
+ case VPDef::VPFirstOrderRecurrencePHISC:
+ case VPDef::VPPartialReductionSC:
+ case VPDef::VPReductionPHISC:
+ case VPDef::VPReductionSC:
+ case VPDef::VPWidenCallSC:
+ case VPDef::VPWidenCanonicalIVSC:
+ case VPDef::VPWidenCastSC:
+ case VPDef::VPWidenGEPSC:
+ case VPDef::VPWidenIntOrFpInductionSC:
+ case VPDef::VPWidenIntrinsicSC:
+ case VPDef::VPWidenPHISC:
+ case VPDef::VPWidenPointerInductionSC:
+ case VPDef::VPWidenSC:
+ case VPDef::VPWidenSelectSC:
Cost += R.cost(VF, Ctx);
break;
default:
// Recipes that are not vector calculations (memory operations,
// scalar steps, branches, ...) are intentionally excluded when
// comparing costs conservatively.
break;
}
+ }
+ }
return Cost;
}
@@ -826,11 +850,12 @@ const VPBasicBlock *VPBasicBlock::getCFGPredecessor(unsigned Idx) const {
return Pred->getExitingBasicBlock();
}
-InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
+InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly) {
if (!isReplicator()) {
InstructionCost Cost = 0;
for (VPBlockBase *Block : vp_depth_first_shallow(getEntry()))
- Cost += Block->cost(VF, Ctx);
+ Cost += Block->cost(VF, Ctx, CountsVecCalcOnly);
InstructionCost BackedgeCost =
ForceTargetInstructionCost.getNumOccurrences()
? InstructionCost(ForceTargetInstructionCost.getNumOccurrences())
@@ -853,7 +878,7 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
// uniform condition.
using namespace llvm::VPlanPatternMatch;
VPBasicBlock *Then = cast<VPBasicBlock>(getEntry()->getSuccessors()[0]);
- InstructionCost ThenCost = Then->cost(VF, Ctx);
+ InstructionCost ThenCost = Then->cost(VF, Ctx, CountsVecCalcOnly);
// For the scalar case, we may not always execute the original predicated
// block, Thus, scale the block's cost by the probability of executing it.
@@ -1016,19 +1041,22 @@ void VPlan::execute(VPTransformState *State) {
}
}
-InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx) {
+InstructionCost VPlan::cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly) {
// For now only return the cost of the vector loop region, ignoring any other
// blocks, like the preheader or middle blocks, expect for checking them for
// recipes with invalid costs.
- InstructionCost Cost = getVectorLoopRegion()->cost(VF, Ctx);
+ InstructionCost Cost =
+ getVectorLoopRegion()->cost(VF, Ctx, CountsVecCalcOnly);
// If the cost of the loop region is invalid or any recipe in the skeleton
// outside loop regions are invalid return an invalid cost.
- if (!Cost.isValid() || any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
- vp_depth_first_shallow(getEntry())),
- [&VF, &Ctx](VPBasicBlock *VPBB) {
- return !VPBB->cost(VF, Ctx).isValid();
- }))
+ if (!Cost.isValid() ||
+ any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(getEntry())),
+ [&VF, &Ctx, &CountsVecCalcOnly](VPBasicBlock *VPBB) {
+ return !VPBB->cost(VF, Ctx, CountsVecCalcOnly).isValid();
+ }))
return InstructionCost::getInvalid();
return Cost;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index d6bc462a0dfab..88f4f5dd24eaa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -340,7 +340,8 @@ class LLVM_ABI_FOR_TEST VPBlockBase {
virtual void execute(VPTransformState *State) = 0;
/// Return the cost of the block.
- virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0;
+ virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly = false) = 0;
/// Return true if it is legal to hoist instructions into this block.
bool isLegalToHoistInto() {
@@ -3716,7 +3717,8 @@ class LLVM_ABI_FOR_TEST VPBasicBlock : public VPBlockBase {
void execute(VPTransformState *State) override;
/// Return the cost of this VPBasicBlock.
- InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
+ InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly) override;
/// Return the position of the first non-phi node recipe in the block.
iterator getFirstNonPhi();
@@ -3897,7 +3899,8 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
void execute(VPTransformState *State) override;
// Return the cost of this region.
- InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override;
+ InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly) override;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
@@ -4022,7 +4025,8 @@ class VPlan {
void execute(VPTransformState *State);
/// Return the cost of this plan.
- InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
+ InstructionCost cost(ElementCount VF, VPCostContext &Ctx,
+ bool CountsVecCalcOnly = false);
VPBasicBlock *getEntry() { return Entry; }
const VPBasicBlock *getEntry() const { return Entry; }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll
new file mode 100644
index 0000000000000..441669c5f6dc6
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-conservatively.ll
@@ -0,0 +1,58 @@
+; REQUIRES: asserts
+; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
+; RUN: opt < %s -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -vectorizer-maximize-bandwidth -vectorizer-maximize-bandwidth-conservatively -S -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=CHECK-CONS
+
+define void @f(i32 %n, ptr noalias %a, ptr %b, ptr %c) {
+; The following loop is an example where choosing a larger vector width reduces
+; the number of instructions but may lead to performance degradation due to the
+; FP pipeline becoming a bottleneck.
+;
+; void f(int n, short *restrict a, long *b, double *c) {
+; for (int i = 0; i < n; i++) {
+; a[i] = b[i] + c[i];
+; }
+; }
+
+; In the usual cost model, vscale x 8 is chosen.
+; CHECK: Cost for VF vscale x 2: 8 (Estimated cost per lane: 4.0)
+; CHECK: Cost for VF vscale x 4: 14 (Estimated cost per lane: 3.5)
+; CHECK: Cost for VF vscale x 8: 26 (Estimated cost per lane: 3.2)
+; CHECK: LV: Selecting VF: vscale x 8.
+
+; In a conservative cost model, a larger vector width is chosen only if it is
+; superior when compared solely based on the cost of the FP pipeline, in
+; addition to the usual model.
+; CHECK-CONS: Cost for VF vscale x 2: 3 (Estimated cost per lane: 1.5)
+; CHECK-CONS: Cost for VF vscale x 4: 7 (Estimated cost per lane: 1.8)
+; CHECK-CONS: Cost for VF vscale x 8: 15 (Estimated cost per lane: 1.9)
+; CHECK-CONS: LV: Selecting VF: vscale x 2.
+
+entry:
+ %cmp10 = icmp sgt i32 %n, 0
+ br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ %wide.trip.count = zext nneg i32 %n to i64
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ ret void
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds nuw i64, ptr %b, i64 %indvars.iv
+ %0 = load i64, ptr %arrayidx, align 8
+ %conv = sitofp i64 %0 to double
+ %arrayidx2 = getelementptr inbounds nuw double, ptr %c, i64 %indvars.iv
+ %1 = load double, ptr %arrayidx2, align 8
+ %add = fadd double %1, %conv
+ %conv3 = fptosi double %add to i16
+ %arrayidx5 = getelementptr inbounds nuw i16, ptr %a, i64 %indvars.iv
+ store i16 %conv3, ptr %arrayidx5, align 2
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
>From a52e829f45cc069e24abd60bca53382a78b40220 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Tue, 2 Sep 2025 12:54:11 +0000
Subject: [PATCH 2/2] fixup! [LV] Add a flag to conservatively choose a larger
vector factor when maximizing bandwidth
---
llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 7 +------
1 file changed, 1 insertion(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ac75a8a1727e9..490f6391c15a0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -76,9 +76,6 @@ static cl::opt<unsigned> DMBLookaheadThreshold(
"dmb-lookahead-threshold", cl::init(10), cl::Hidden,
cl::desc("The number of instructions to search for a redundant dmb"));
-static cl::opt<bool> EnableSVEMaximizeVecBW("enable-sve-maximize-vec-bw",
- cl::init(false), cl::Hidden);
-
namespace {
class TailFoldingOption {
// These bitfields will only ever be set to something non-zero in operator=,
@@ -373,9 +370,7 @@ bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
- ST->isNeonAvailable()) ||
- (EnableSVEMaximizeVecBW &&
- K == TargetTransformInfo::RGK_ScalableVector && ST->isSVEAvailable());
+ ST->isNeonAvailable());
}
/// Calculate the cost of materializing a 64-bit value. This helper
More information about the llvm-commits
mailing list