[llvm] 584e9b6 - [LV] Calculate max feasible scalable VF.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 28 04:33:28 PDT 2021
Author: Sander de Smalen
Date: 2021-04-28T12:30:00+01:00
New Revision: 584e9b6e4b4987b882719923e640eed854613d91
URL: https://github.com/llvm/llvm-project/commit/584e9b6e4b4987b882719923e640eed854613d91
DIFF: https://github.com/llvm/llvm-project/commit/584e9b6e4b4987b882719923e640eed854613d91.diff
LOG: [LV] Calculate max feasible scalable VF.
This patch also refactors the way the feasible max VF is calculated,
although this is NFC for fixed-width vectors.
After this change, scalable VF hints are no longer truncated/clamped
to a shorter scalable VF, nor is the 'scalable flag' dropped from the
suggested VF in order to vectorize with a similar fixed-width VF.
Instead, the hint is ignored which means the vectorizer is free
to find a more suitable VF, using the CostModel to determine the
best possible VF.
Reviewed By: c-rhodes, fhahn
Differential Revision: https://reviews.llvm.org/D98509
Added:
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index ecb44a7b1518d..ad6a4b561a9bb 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -174,6 +174,13 @@ void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr);
+/// Reports an informative message: print \p OREMsg for debugging purposes as well
+/// as an optimization remark. Uses either \p I as location of the remark, or
+/// otherwise \p TheLoop.
+void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ Instruction *I = nullptr);
+
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 27161ea75e303..2b413fc495052 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1065,13 +1065,13 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
B.SetCurrentDebugLocation(DebugLoc());
}
-/// Write a record \p DebugMsg about vectorization failure to the debug
-/// output stream. If \p I is passed, it is an instruction that prevents
-/// vectorization.
+/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
+/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
-static void debugVectorizationFailure(const StringRef DebugMsg,
- Instruction *I) {
- dbgs() << "LV: Not vectorizing: " << DebugMsg;
+static void debugVectorizationMessage(const StringRef Prefix,
+ const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: " << Prefix << DebugMsg;
if (I != nullptr)
dbgs() << " " << *I;
else
@@ -1100,9 +1100,7 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
DL = I->getDebugLoc();
}
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
+ return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
/// Return a value for Step multiplied by VF.
@@ -1123,12 +1121,24 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
}
void reportVectorizationFailure(const StringRef DebugMsg,
- const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
- LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ Instruction *I) {
+ LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
+ LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
+ ORE->emit(
+ createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
+ << "loop not vectorized: " << OREMsg);
+}
+
+void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop,
+ Instruction *I) {
+ LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
- ORETag, TheLoop, I) << OREMsg);
+ ORE->emit(
+ createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
+ << Msg);
}
} // end namespace llvm
@@ -1623,6 +1633,18 @@ class LoopVectorizationCostModel {
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
+ /// \return the maximized element count based on the target's vector
+ /// registers and the loop trip-count, but limited to a maximum safe VF.
+ /// This is a helper function of computeFeasibleMaxVF.
+ ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
+ unsigned SmallestType,
+ unsigned WidestType,
+ ElementCount MaxSafeVF);
+
+ /// \return the maximum legal scalable VF, based on the safe max number
+ /// of elements.
+ ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
+
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
@@ -5576,6 +5598,129 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
+ElementCount
+LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
+ if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
+ reportVectorizationInfo(
+ "Disabling scalable vectorization, because target does not "
+ "support scalable vectors.",
+ "ScalableVectorsUnsupported", ORE, TheLoop);
+ return ElementCount::getScalable(0);
+ }
+
+ auto MaxScalableVF = ElementCount::getScalable(1u << 16);
+
+ // Disable scalable vectorization if the loop contains unsupported reductions.
+ // Test that the loop-vectorizer can legalize all operations for this MaxVF.
+ // FIXME: While for scalable vectors this is currently sufficient, this should
+ // be replaced by a more detailed mechanism that filters out specific VFs,
+ // instead of invalidating vectorization for a whole set of VFs based on the
+ // MaxVF.
+ if (!canVectorizeReductions(MaxScalableVF)) {
+ reportVectorizationInfo(
+ "Scalable vectorization not supported for the reduction "
+ "operations found in this loop.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+ return ElementCount::getScalable(0);
+ }
+
+ if (Legal->isSafeForAnyVectorWidth())
+ return MaxScalableVF;
+
+ // Limit MaxScalableVF by the maximum safe dependence distance.
+ Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+ MaxScalableVF = ElementCount::getScalable(
+ MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+ if (!MaxScalableVF)
+ reportVectorizationInfo(
+ "Max legal vector width too small, scalable vectorization "
+ "unfeasible.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+
+ return MaxScalableVF;
+}
+
+ElementCount
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF) {
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+ // the memory access that is most restrictive (involved in the smallest
+ // dependence distance).
+ unsigned MaxSafeElements =
+ PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+
+ auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+
+ LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
+ << ".\n");
+ LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
+ << ".\n");
+
+ // First analyze the UserVF, fall back if the UserVF should be ignored.
+ if (UserVF) {
+ auto MaxSafeUserVF =
+ UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
+
+ if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
+ return UserVF;
+
+ assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
+
+ // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
+ // is better to ignore the hint and let the compiler choose a suitable VF.
+ if (!UserVF.isScalable()) {
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe, clamping to max safe VF="
+ << MaxSafeFixedVF << ".\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe, clamping to maximum safe vectorization factor "
+ << ore::NV("VectorizationFactor", MaxSafeFixedVF);
+ });
+ return MaxSafeFixedVF;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe. Ignoring scalable UserVF.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe. Ignoring the hint to let the compiler pick a "
+ "suitable VF.";
+ });
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+
+ ElementCount MaxFixedVF = ElementCount::getFixed(1);
+ if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
+ WidestType, MaxSafeFixedVF))
+ MaxFixedVF = MaxVF;
+
+ if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
+ WidestType, MaxSafeScalableVF))
+ // FIXME: Return scalable VF as well (to be added in future patch).
+ if (MaxVF.isScalable())
+ LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
+ << "\n");
+
+ return MaxFixedVF;
+}
+
Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
@@ -5716,149 +5861,61 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
-ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF) {
- bool IgnoreScalableUserVF = UserVF.isScalable() &&
- !TTI.supportsScalableVectors() &&
- !ForceTargetSupportsScalableVectors;
- if (IgnoreScalableUserVF) {
- LLVM_DEBUG(
- dbgs() << "LV: Ignoring VF=" << UserVF
- << " because target does not support scalable vectors.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "Ignoring VF=" << ore::NV("UserVF", UserVF)
- << " because target does not support scalable vectors.";
- });
- }
-
- // Beyond this point two scenarios are handled. If UserVF isn't specified
- // then a suitable VF is chosen. If UserVF is specified and there are
- // dependencies, check if it's legal. However, if a UserVF is specified and
- // there are no dependencies, then there's nothing to do.
- if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
- if (!canVectorizeReductions(UserVF)) {
- reportVectorizationFailure(
- "LV: Scalable vectorization not supported for the reduction "
- "operations found in this loop. Using fixed-width "
- "vectorization instead.",
- "Scalable vectorization not supported for the reduction operations "
- "found in this loop. Using fixed-width vectorization instead.",
- "ScalableVFUnfeasible", ORE, TheLoop);
- return computeFeasibleMaxVF(
- ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
- }
-
- if (Legal->isSafeForAnyVectorWidth())
- return UserVF;
- }
-
- MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
- unsigned SmallestType, WidestType;
- std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
- unsigned WidestRegister =
- TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
- .getFixedSize();
+ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
+ unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
+ ElementCount MaxSafeVF) {
+ bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
+ TypeSize WidestRegister = TTI.getRegisterBitWidth(
+ ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
+ : TargetTransformInfo::RGK_FixedWidthVector);
- // Get the maximum safe dependence distance in bits computed by LAA.
- // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
- // the memory accesses that is most restrictive (involved in the smallest
- // dependence distance).
- unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
-
- // If the user vectorization factor is legally unsafe, clamp it to a safe
- // value. Otherwise, return as is.
- if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
- unsigned MaxSafeElements =
- PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
- ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
-
- if (UserVF.isScalable()) {
- Optional<unsigned> MaxVScale = TTI.getMaxVScale();
-
- // Scale VF by vscale before checking if it's safe.
- MaxSafeVF = ElementCount::getScalable(
- MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
-
- if (MaxSafeVF.isZero()) {
- // The dependence distance is too small to use scalable vectors,
- // fallback on fixed.
- LLVM_DEBUG(
- dbgs()
- << "LV: Max legal vector width too small, scalable vectorization "
- "unfeasible. Using fixed-width vectorization instead.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "Max legal vector width too small, scalable vectorization "
- << "unfeasible. Using fixed-width vectorization instead.";
- });
- return computeFeasibleMaxVF(
- ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
-
- if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
- return UserVF;
-
- LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
- << " is unsafe, clamping to max safe VF=" << MaxSafeVF
- << ".\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "User-specified vectorization factor "
- << ore::NV("UserVectorizationFactor", UserVF)
- << " is unsafe, clamping to maximum safe vectorization factor "
- << ore::NV("VectorizationFactor", MaxSafeVF);
- });
- return MaxSafeVF;
- }
-
- WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
+ // Convenience function to return the minimum of two ElementCounts.
+ auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
+ assert((LHS.isScalable() == RHS.isScalable()) &&
+ "Scalable flags must match");
+ return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
+ };
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
- auto MaxVectorSize =
- ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
-
- LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
- << " / " << WidestType << " bits.\n");
+ auto MaxVectorElementCount = ElementCount::get(
+ PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
+ ComputeScalableMaxVF);
+ MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
- << WidestRegister << " bits.\n");
+ << (MaxVectorElementCount * WidestType) << " bits.\n");
- assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
- "Did not expect to pack so many elements"
- " into one vector!");
- if (MaxVectorSize.getFixedValue() == 0) {
+ if (!MaxVectorElementCount) {
LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
return ElementCount::getFixed(1);
- } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
- isPowerOf2_32(ConstTripCount)) {
+ }
+
+ const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
+ if (ConstTripCount &&
+ ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
+ isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below.
+ // choosing a higher viable VF as done in the loop below. If
+ // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
+ // the TC is less than or equal to the known number of lanes.
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
- return ElementCount::getFixed(ConstTripCount);
+ return TripCountEC;
}
- ElementCount MaxVF = MaxVectorSize;
+ ElementCount MaxVF = MaxVectorElementCount;
if (TTI.shouldMaximizeVectorBandwidth() ||
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ auto MaxVectorElementCountMaxBW = ElementCount::get(
+ PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
+ ComputeScalableMaxVF);
+ MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
+
// Collect all viable vectorization factors larger than the default MaxVF
- // (i.e. MaxVectorSize).
+ // (i.e. MaxVectorElementCount).
SmallVector<ElementCount, 8> VFs;
- auto MaxVectorSizeMaxBW =
- ElementCount::getFixed(WidestRegister / SmallestType);
- for (ElementCount VS = MaxVectorSize * 2;
- ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
+ for (ElementCount VS = MaxVectorElementCount * 2;
+ ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
VFs.push_back(VS);
// For each VF calculate its register usage.
@@ -5879,7 +5936,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
}
}
if (ElementCount MinVF =
- TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
+ TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
<< ") with target's minimum: " << MinVF << '\n');
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 582bf4dc5747c..7b410dd73a3c1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -221,7 +221,7 @@ for.end:
ret float %add
}
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast_bfloat
@@ -322,18 +322,18 @@ for.end:
; MUL
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <8 x i32>
-; CHECK: %[[LOAD2:.*]] = load <8 x i32>
-; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
-; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
@@ -352,22 +352,22 @@ for.end: ; preds = %for.body, %entry
}
; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
-; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <8 x i32>
-; CHECK: %[[LOAD2:.*]] = load <8 x i32>
-; CHECK: %[[LOAD3:.*]] = load <8 x i32>
-; CHECK: %[[LOAD4:.*]] = load <8 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[LOAD3:.*]] = load <4 x i32>
+; CHECK: %[[LOAD4:.*]] = load <4 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
new file mode 100644
index 0000000000000..6fe546439a133
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
@@ -0,0 +1,149 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON
+; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW
+
+; Test that the MaxVF for the following loop, that has no dependence distances,
+; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
+; (maximized bandwidth for i8 in the loop).
+define void @test0(i32* %a, i8* %b, i32* %c) {
+; CHECK: LV: Checking a loop in "test0"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+ %1 = load i8, i8* %arrayidx2, align 4
+ %zext = zext i8 %1 to i32
+ %add = add nsw i32 %zext, %0
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 64 elements, is calculated as (maxvscale = 16) * 4.
+define void @test1(i32* %a, i8* %b) {
+; CHECK: LV: Checking a loop in "test1"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+ %1 = load i8, i8* %arrayidx2, align 4
+ %zext = zext i8 %1 to i32
+ %add = add nsw i32 %zext, %0
+ %2 = add nuw nsw i64 %iv, 64
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 32 elements, is calculated as (maxvscale = 16) * 2.
+define void @test2(i32* %a, i8* %b) {
+; CHECK: LV: Checking a loop in "test2"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+ %1 = load i8, i8* %arrayidx2, align 4
+ %zext = zext i8 %1 to i32
+ %add = add nsw i32 %zext, %0
+ %2 = add nuw nsw i64 %iv, 32
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 16 elements, is calculated as (maxvscale = 16) * 1.
+define void @test3(i32* %a, i8* %b) {
+; CHECK: LV: Checking a loop in "test3"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+ %1 = load i8, i8* %arrayidx2, align 4
+ %zext = zext i8 %1 to i32
+ %add = add nsw i32 %zext, %0
+ %2 = add nuw nsw i64 %iv, 16
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+; Test the fallback mechanism when scalable vectors are not feasible due
+; to e.g. dependence distance. For the '-scalable-vectorization=exclusive'
+; it shouldn't try to vectorize with fixed-width vectors.
+define void @test4(i32* %a, i32* %b) {
+; CHECK: LV: Checking a loop in "test4"
+; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %2 = add nuw nsw i64 %iv, 8
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2
+
+exit:
+ ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = distinct !{!2, !3, !4}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
index a6b50e03768c6..33d75e0a8565c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -37,9 +37,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
; fixed-width vectorization is used instead.
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CHECK-DBG: LV: The max safe VF is: 8.
+; CHECK-DBG: LV: Checking a loop in "test1"
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible.
+; CHECK-DBG: LV: The max safe fixed VF is: 8.
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test1
; CHECK: <4 x i32>
@@ -80,9 +81,10 @@ exit:
; }
; }
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CHECK-DBG: LV: The max safe VF is: 4.
-; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
+; CHECK-DBG: LV: Checking a loop in "test2"
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
+; CHECK-DBG: LV: The max safe fixed VF is: 4.
+; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF.
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test2
; CHECK: <4 x i32>
@@ -129,7 +131,7 @@ exit:
; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
-; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
; CHECK-DBG: LV: Using user VF vscale x 2.
; CHECK-LABEL: @test3
; CHECK: <vscale x 2 x i32>
@@ -161,7 +163,8 @@ exit:
; test4
;
-; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+; Scalable vectorization feasible, but the given VF is unsafe. Should ignore
+; the hint and leave it to the vectorizer to pick a more suitable VF.
;
; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
; words per 128-bits (packed).
@@ -173,15 +176,16 @@ exit:
; }
; }
;
-; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
+; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
-; CHECK-DBG: LV: The max safe VF is: vscale x 2.
-; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
-; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
-; CHECK-DBG: LV: Using max VF vscale x 2
+; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
+; CHECK-DBG: Found feasible scalable VF = vscale x 2
+; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test4
-; CHECK: <vscale x 2 x i32>
+; CHECK: <4 x i32>
define void @test4(i32* %a, i32* %b) {
entry:
br label %loop
@@ -225,7 +229,7 @@ exit:
; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
-; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
; CHECK-DBG: LV: Using user VF vscale x 4
; CHECK-LABEL: @test5
; CHECK: <vscale x 4 x i32>
@@ -257,7 +261,8 @@ exit:
; test6
;
-; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
+; Scalable vectorization feasible, but the VF is unsafe. Should ignore
+; the hint and leave it to the vectorizer to pick a more suitable VF.
;
; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
;
@@ -268,15 +273,16 @@ exit:
; }
; }
;
-; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
+; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
-; CHECK-DBG: LV: The max safe VF is: vscale x 8.
-; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
-; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
-; CHECK-DBG: LV: Using max VF vscale x 8
+; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
+; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
+; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4
+; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test6
-; CHECK: <vscale x 8 x i32>
+; CHECK: <4 x i32>
define void @test6(i32* %a, i32* %b) {
entry:
br label %loop
@@ -304,8 +310,9 @@ exit:
!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve"
-; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
-; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
+; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
; CHECK-NO-SVE: LV: Selecting VF: 4.
; CHECK-NO-SVE: <4 x i32>
; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
@@ -337,8 +344,8 @@ exit:
; supported but max vscale is undefined.
;
; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
-; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
-; CEHCK-NO-MAX-VSCALE: The max safe VF is: 4.
+; CHECK-NO-MAX-VSCALE: The max safe fixed VF is: 4.
+; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
; CHECK-NO-MAX-VSCALE: <4 x i32>
define void @test_no_max_vscale(i32* %a, i32* %b) {
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
index 1361ba59bca2d..f99d87f8ad795 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -3,8 +3,8 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
-; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
; CHECK: LV: The Widest register safe to use is: 32 bits.
define void @test1(i32* %a, i32* %b) {
entry:
More information about the llvm-commits
mailing list