[llvm] 51d648c - Revert "[LV] Calculate max feasible scalable VF."
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 29 08:04:55 PDT 2021
Author: Sander de Smalen
Date: 2021-04-29T16:04:37+01:00
New Revision: 51d648c119d7773ce6fb809353bd6bd14bca8818
URL: https://github.com/llvm/llvm-project/commit/51d648c119d7773ce6fb809353bd6bd14bca8818
DIFF: https://github.com/llvm/llvm-project/commit/51d648c119d7773ce6fb809353bd6bd14bca8818.diff
LOG: Revert "[LV] Calculate max feasible scalable VF."
Temporarily reverting this patch due to some unexpected issue found
by one of the PPC buildbots.
This reverts commit 584e9b6e4b4987b882719923e640eed854613d91.
Added:
Modified:
llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
Removed:
llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
################################################################################
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
index ad6a4b561a9bb..ecb44a7b1518d 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -174,13 +174,6 @@ void reportVectorizationFailure(const StringRef DebugMsg,
const StringRef OREMsg, const StringRef ORETag,
OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr);
-/// Reports an informative message: print \p Msg for debugging purposes as well
-/// as an optimization remark. Uses either \p I as location of the remark, or
-/// otherwise \p TheLoop.
-void reportVectorizationInfo(const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop,
- Instruction *I = nullptr);
-
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZE_H
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0fe92b3685e54..f8f47c475088e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1065,13 +1065,13 @@ void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr)
B.SetCurrentDebugLocation(DebugLoc());
}
-/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
-/// is passed, the message relates to that particular instruction.
+/// Write a record \p DebugMsg about vectorization failure to the debug
+/// output stream. If \p I is passed, it is an instruction that prevents
+/// vectorization.
#ifndef NDEBUG
-static void debugVectorizationMessage(const StringRef Prefix,
- const StringRef DebugMsg,
- Instruction *I) {
- dbgs() << "LV: " << Prefix << DebugMsg;
+static void debugVectorizationFailure(const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: Not vectorizing: " << DebugMsg;
if (I != nullptr)
dbgs() << " " << *I;
else
@@ -1100,7 +1100,9 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
DL = I->getDebugLoc();
}
- return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
}
/// Return a value for Step multiplied by VF.
@@ -1121,24 +1123,12 @@ Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
}
void reportVectorizationFailure(const StringRef DebugMsg,
- const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop,
- Instruction *I) {
- LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
- LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(
- createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
- << "loop not vectorized: " << OREMsg);
-}
-
-void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop,
- Instruction *I) {
- LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
+ LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(
- createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
- << Msg);
+ ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
+ ORETag, TheLoop, I) << OREMsg);
}
} // end namespace llvm
@@ -1633,18 +1623,6 @@ class LoopVectorizationCostModel {
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
- /// \return the maximized element count based on the targets vector
- /// registers and the loop trip-count, but limited to a maximum safe VF.
- /// This is a helper function of computeFeasibleMaxVF.
- ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
- unsigned SmallestType,
- unsigned WidestType,
- ElementCount MaxSafeVF);
-
- /// \return the maximum legal scalable VF, based on the safe max number
- /// of elements.
- ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
-
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
/// operate on
@@ -5598,129 +5576,6 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-ElementCount
-LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
- if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
- reportVectorizationInfo(
- "Disabling scalable vectorization, because target does not "
- "support scalable vectors.",
- "ScalableVectorsUnsupported", ORE, TheLoop);
- return ElementCount::getScalable(0);
- }
-
- auto MaxScalableVF = ElementCount::getScalable(1u << 16);
-
- // Disable scalable vectorization if the loop contains unsupported reductions.
- // Test that the loop-vectorizer can legalize all operations for this MaxVF.
- // FIXME: While for scalable vectors this is currently sufficient, this should
- // be replaced by a more detailed mechanism that filters out specific VFs,
- // instead of invalidating vectorization for a whole set of VFs based on the
- // MaxVF.
- if (!canVectorizeReductions(MaxScalableVF)) {
- reportVectorizationInfo(
- "Scalable vectorization not supported for the reduction "
- "operations found in this loop.",
- "ScalableVFUnfeasible", ORE, TheLoop);
- return ElementCount::getScalable(0);
- }
-
- if (Legal->isSafeForAnyVectorWidth())
- return MaxScalableVF;
-
- // Limit MaxScalableVF by the maximum safe dependence distance.
- Optional<unsigned> MaxVScale = TTI.getMaxVScale();
- MaxScalableVF = ElementCount::getScalable(
- MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
- if (!MaxScalableVF)
- reportVectorizationInfo(
- "Max legal vector width too small, scalable vectorization "
- "unfeasible.",
- "ScalableVFUnfeasible", ORE, TheLoop);
-
- return MaxScalableVF;
-}
-
-ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
- ElementCount UserVF) {
- MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
- unsigned SmallestType, WidestType;
- std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
-
- // Get the maximum safe dependence distance in bits computed by LAA.
- // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
- // the memory accesses that is most restrictive (involved in the smallest
- // dependence distance).
- unsigned MaxSafeElements =
- PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
-
- auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
-
- LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
- << ".\n");
- LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
- << ".\n");
-
- // First analyze the UserVF, fall back if the UserVF should be ignored.
- if (UserVF) {
- auto MaxSafeUserVF =
- UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
-
- if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
- return UserVF;
-
- assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
-
- // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
- // is better to ignore the hint and let the compiler choose a suitable VF.
- if (!UserVF.isScalable()) {
- LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
- << " is unsafe, clamping to max safe VF="
- << MaxSafeFixedVF << ".\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "User-specified vectorization factor "
- << ore::NV("UserVectorizationFactor", UserVF)
- << " is unsafe, clamping to maximum safe vectorization factor "
- << ore::NV("VectorizationFactor", MaxSafeFixedVF);
- });
- return MaxSafeFixedVF;
- }
-
- LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
- << " is unsafe. Ignoring scalable UserVF.\n");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "User-specified vectorization factor "
- << ore::NV("UserVectorizationFactor", UserVF)
- << " is unsafe. Ignoring the hint to let the compiler pick a "
- "suitable VF.";
- });
- }
-
- LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
- << " / " << WidestType << " bits.\n");
-
- ElementCount MaxFixedVF = ElementCount::getFixed(1);
- if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
- WidestType, MaxSafeFixedVF))
- MaxFixedVF = MaxVF;
-
- if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
- WidestType, MaxSafeScalableVF))
- // FIXME: Return scalable VF as well (to be added in future patch).
- if (MaxVF.isScalable())
- LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
- << "\n");
-
- return MaxFixedVF;
-}
-
Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
@@ -5861,61 +5716,149 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
-ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
- unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
- ElementCount MaxSafeVF) {
- bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
- TypeSize WidestRegister = TTI.getRegisterBitWidth(
- ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
- : TargetTransformInfo::RGK_FixedWidthVector);
+ElementCount
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF) {
+ bool IgnoreScalableUserVF = UserVF.isScalable() &&
+ !TTI.supportsScalableVectors() &&
+ !ForceTargetSupportsScalableVectors;
+ if (IgnoreScalableUserVF) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Ignoring VF=" << UserVF
+ << " because target does not support scalable vectors.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "IgnoreScalableUserVF",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "Ignoring VF=" << ore::NV("UserVF", UserVF)
+ << " because target does not support scalable vectors.";
+ });
+ }
- // Convenience function to return the minimum of two ElementCounts.
- auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
- assert((LHS.isScalable() == RHS.isScalable()) &&
- "Scalable flags must match");
- return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
- };
+ // Beyond this point two scenarios are handled. If UserVF isn't specified
+ // then a suitable VF is chosen. If UserVF is specified and there are
+ // dependencies, check if it's legal. However, if a UserVF is specified and
+ // there are no dependencies, then there's nothing to do.
+ if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+ if (!canVectorizeReductions(UserVF)) {
+ reportVectorizationFailure(
+ "LV: Scalable vectorization not supported for the reduction "
+ "operations found in this loop. Using fixed-width "
+ "vectorization instead.",
+ "Scalable vectorization not supported for the reduction operations "
+ "found in this loop. Using fixed-width vectorization instead.",
+ "ScalableVFUnfeasible", ORE, TheLoop);
+ return computeFeasibleMaxVF(
+ ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+ }
+
+ if (Legal->isSafeForAnyVectorWidth())
+ return UserVF;
+ }
+
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+ unsigned WidestRegister =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedSize();
+
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+ // the memory accesses that is most restrictive (involved in the smallest
+ // dependence distance).
+ unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
+
+ // If the user vectorization factor is legally unsafe, clamp it to a safe
+ // value. Otherwise, return as is.
+ if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
+ unsigned MaxSafeElements =
+ PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+ ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
+
+ if (UserVF.isScalable()) {
+ Optional<unsigned> MaxVScale = TTI.getMaxVScale();
+
+ // Scale VF by vscale before checking if it's safe.
+ MaxSafeVF = ElementCount::getScalable(
+ MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
+
+ if (MaxSafeVF.isZero()) {
+ // The dependence distance is too small to use scalable vectors,
+ // fallback on fixed.
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Max legal vector width too small, scalable vectorization "
+ "unfeasible. Using fixed-width vectorization instead.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "ScalableVFUnfeasible",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "Max legal vector width too small, scalable vectorization "
+ << "unfeasible. Using fixed-width vectorization instead.";
+ });
+ return computeFeasibleMaxVF(
+ ConstTripCount, ElementCount::getFixed(UserVF.getKnownMinValue()));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: The max safe VF is: " << MaxSafeVF << ".\n");
+
+ if (ElementCount::isKnownLE(UserVF, MaxSafeVF))
+ return UserVF;
+
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+ << ".\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe, clamping to maximum safe vectorization factor "
+ << ore::NV("VectorizationFactor", MaxSafeVF);
+ });
+ return MaxSafeVF;
+ }
+
+ WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
// Note that both WidestRegister and WidestType may not be a powers of 2.
- auto MaxVectorElementCount = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
- ComputeScalableMaxVF);
- MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
+ auto MaxVectorSize =
+ ElementCount::getFixed(PowerOf2Floor(WidestRegister / WidestType));
+
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
- << (MaxVectorElementCount * WidestType) << " bits.\n");
+ << WidestRegister << " bits.\n");
- if (!MaxVectorElementCount) {
+ assert(MaxVectorSize.getFixedValue() <= WidestRegister &&
+ "Did not expect to pack so many elements"
+ " into one vector!");
+ if (MaxVectorSize.getFixedValue() == 0) {
LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
return ElementCount::getFixed(1);
- }
-
- const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
- if (ConstTripCount &&
- ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
- isPowerOf2_32(ConstTripCount)) {
+ } else if (ConstTripCount && ConstTripCount < MaxVectorSize.getFixedValue() &&
+ isPowerOf2_32(ConstTripCount)) {
// We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below. If
- // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
- // the TC is less than or equal to the known number of lanes.
+ // choosing a higher viable VF as done in the loop below.
LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
<< ConstTripCount << "\n");
- return TripCountEC;
+ return ElementCount::getFixed(ConstTripCount);
}
- ElementCount MaxVF = MaxVectorElementCount;
+ ElementCount MaxVF = MaxVectorSize;
if (TTI.shouldMaximizeVectorBandwidth() ||
(MaximizeBandwidth && isScalarEpilogueAllowed())) {
- auto MaxVectorElementCountMaxBW = ElementCount::get(
- PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
- ComputeScalableMaxVF);
- MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
-
// Collect all viable vectorization factors larger than the default MaxVF
- // (i.e. MaxVectorElementCount).
+ // (i.e. MaxVectorSize).
SmallVector<ElementCount, 8> VFs;
- for (ElementCount VS = MaxVectorElementCount * 2;
- ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
+ auto MaxVectorSizeMaxBW =
+ ElementCount::getFixed(WidestRegister / SmallestType);
+ for (ElementCount VS = MaxVectorSize * 2;
+ ElementCount::isKnownLE(VS, MaxVectorSizeMaxBW); VS *= 2)
VFs.push_back(VS);
// For each VF calculate its register usage.
@@ -5936,7 +5879,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
}
}
if (ElementCount MinVF =
- TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
+ TTI.getMinimumVF(SmallestType, /*IsScalable=*/false)) {
if (ElementCount::isKnownLT(MaxVF, MinVF)) {
LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
<< ") with target's minimum: " << MinVF << '\n');
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 7b410dd73a3c1..582bf4dc5747c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -221,7 +221,7 @@ for.end:
ret float %add
}
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
; CHECK-LABEL: @fadd_fast_bfloat
@@ -322,18 +322,18 @@ for.end:
; MUL
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
+; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @mul
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <4 x i32>
-; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: %[[LOAD1:.*]] = load <8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <8 x i32>
+; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD2]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
entry:
br label %for.body
@@ -352,22 +352,22 @@ for.end: ; preds = %for.body, %entry
}
; Note: This test was added to ensure we always check the legality of reductions (end emit a warning if necessary) before checking for memory dependencies
-; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
-; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. Using fixed-width vectorization instead.
+; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @memory_dependence
; CHECK: vector.body:
-; CHECK: %[[LOAD1:.*]] = load <4 x i32>
-; CHECK: %[[LOAD2:.*]] = load <4 x i32>
-; CHECK: %[[LOAD3:.*]] = load <4 x i32>
-; CHECK: %[[LOAD4:.*]] = load <4 x i32>
-; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
-; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
-; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
-; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: %[[LOAD1:.*]] = load <8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <8 x i32>
+; CHECK: %[[LOAD3:.*]] = load <8 x i32>
+; CHECK: %[[LOAD4:.*]] = load <8 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <8 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <8 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <8 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <8 x i32> %[[LOAD4]]
; CHECK: middle.block:
-; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
-; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
+; CHECK: %[[RDX:.*]] = mul <8 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %[[RDX]])
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
deleted file mode 100644
index 6fe546439a133..0000000000000
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON
-; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW
-
-; Test that the MaxVF for the following loop, that has no dependence distances,
-; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16
-; (maximized bandwidth for i8 in the loop).
-define void @test0(i32* %a, i8* %b, i32* %c) {
-; CHECK: LV: Checking a loop in "test0"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
- %1 = load i8, i8* %arrayidx2, align 4
- %zext = zext i8 %1 to i32
- %add = add nsw i32 %zext, %0
- %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
- store i32 %add, i32* %arrayidx5, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
- ret void
-}
-
-; Test that the MaxVF for the following loop, with a dependence distance
-; of 64 elements, is calculated as (maxvscale = 16) * 4.
-define void @test1(i32* %a, i8* %b) {
-; CHECK: LV: Checking a loop in "test1"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
- %1 = load i8, i8* %arrayidx2, align 4
- %zext = zext i8 %1 to i32
- %add = add nsw i32 %zext, %0
- %2 = add nuw nsw i64 %iv, 64
- %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
- store i32 %add, i32* %arrayidx5, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
- ret void
-}
-
-; Test that the MaxVF for the following loop, with a dependence distance
-; of 32 elements, is calculated as (maxvscale = 16) * 2.
-define void @test2(i32* %a, i8* %b) {
-; CHECK: LV: Checking a loop in "test2"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
- %1 = load i8, i8* %arrayidx2, align 4
- %zext = zext i8 %1 to i32
- %add = add nsw i32 %zext, %0
- %2 = add nuw nsw i64 %iv, 32
- %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
- store i32 %add, i32* %arrayidx5, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
- ret void
-}
-
-; Test that the MaxVF for the following loop, with a dependence distance
-; of 16 elements, is calculated as (maxvscale = 16) * 1.
-define void @test3(i32* %a, i8* %b) {
-; CHECK: LV: Checking a loop in "test3"
-; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
-; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
- %1 = load i8, i8* %arrayidx2, align 4
- %zext = zext i8 %1 to i32
- %add = add nsw i32 %zext, %0
- %2 = add nuw nsw i64 %iv, 16
- %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
- store i32 %add, i32* %arrayidx5, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
-
-exit:
- ret void
-}
-
-; Test the fallback mechanism when scalable vectors are not feasible due
-; to e.g. dependence distance. For the '-scalable-vectorization=exclusive'
-; it shouldn't try to vectorize with fixed-width vectors.
-define void @test4(i32* %a, i32* %b) {
-; CHECK: LV: Checking a loop in "test4"
-; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
-; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF
-entry:
- br label %loop
-
-loop:
- %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
- %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
- %1 = load i32, i32* %arrayidx2, align 4
- %add = add nsw i32 %1, %0
- %2 = add nuw nsw i64 %iv, 8
- %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
- store i32 %add, i32* %arrayidx5, align 4
- %iv.next = add nuw nsw i64 %iv, 1
- %exitcond.not = icmp eq i64 %iv.next, 1024
- br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !2
-
-exit:
- ret void
-}
-
-!0 = distinct !{!0, !1}
-!1 = !{!"llvm.loop.vectorize.enable", i1 true}
-!2 = distinct !{!2, !3, !4}
-!3 = !{!"llvm.loop.vectorize.enable", i1 true}
-!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
index 33d75e0a8565c..a6b50e03768c6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-hint.ll
@@ -37,10 +37,9 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
; unless max(vscale)=2 it's unsafe to vectorize. For SVE max(vscale)=16, check
; fixed-width vectorization is used instead.
-; CHECK-DBG: LV: Checking a loop in "test1"
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
-; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible.
-; CHECK-DBG: LV: The max safe fixed VF is: 8.
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: remark: <unknown>:0:0: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 8.
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test1
; CHECK: <4 x i32>
@@ -81,10 +80,9 @@ exit:
; }
; }
-; CHECK-DBG: LV: Checking a loop in "test2"
-; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible.
-; CHECK-DBG: LV: The max safe fixed VF is: 4.
-; CHECK-DBG: LV: User VF=vscale x 8 is unsafe. Ignoring scalable UserVF.
+; CHECK-DBG: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-DBG: LV: The max safe VF is: 4.
+; CHECK-DBG: LV: User VF=8 is unsafe, clamping to max safe VF=4.
; CHECK-DBG: LV: Selecting VF: 4.
; CHECK-LABEL: @test2
; CHECK: <4 x i32>
@@ -131,7 +129,7 @@ exit:
; Max fixed VF=32, Max scalable VF=2, safe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test3"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
; CHECK-DBG: LV: Using user VF vscale x 2.
; CHECK-LABEL: @test3
; CHECK: <vscale x 2 x i32>
@@ -163,8 +161,7 @@ exit:
; test4
;
-; Scalable vectorization feasible, but the given VF is unsafe. Should ignore
-; the hint and leave it to the vectorizer to pick a more suitable VF.
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
;
; Specifies a vector of <vscale x 4 x i32>, i.e. maximum of 64 x i32 with 4
; words per 128-bits (packed).
@@ -176,16 +173,15 @@ exit:
; }
; }
;
-; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize.
+; Max fixed VF=32, Max scalable VF=2, unsafe to vectorize. Should clamp to 2.
; CHECK-DBG-LABEL: LV: Checking a loop in "test4"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 2.
-; CHECK-DBG: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
-; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
-; CHECK-DBG: Found feasible scalable VF = vscale x 2
-; CHECK-DBG: LV: Selecting VF: 4.
+; CHECK-DBG: LV: The max safe VF is: vscale x 2.
+; CHECK-DBG: LV: User VF=vscale x 4 is unsafe, clamping to max safe VF=vscale x 2.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is unsafe, clamping to maximum safe vectorization factor vscale x 2
+; CHECK-DBG: LV: Using max VF vscale x 2
; CHECK-LABEL: @test4
-; CHECK: <4 x i32>
+; CHECK: <vscale x 2 x i32>
define void @test4(i32* %a, i32* %b) {
entry:
br label %loop
@@ -229,7 +225,7 @@ exit:
; Max fixed VF=128, Max scalable VF=8, safe to vectorize.
; CHECK-DBG-LABEL: LV: Checking a loop in "test5"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
; CHECK-DBG: LV: Using user VF vscale x 4
; CHECK-LABEL: @test5
; CHECK: <vscale x 4 x i32>
@@ -261,8 +257,7 @@ exit:
; test6
;
-; Scalable vectorization feasible, but the VF is unsafe. Should ignore
-; the hint and leave it to the vectorizer to pick a more suitable VF.
+; Scalable vectorization feasible, but the VF is unsafe. Should clamp.
;
; Specifies a vector of <vscale x 16 x i32>, i.e. maximum of 256 x i32.
;
@@ -273,16 +268,15 @@ exit:
; }
; }
;
-; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize.
+; Max fixed VF=128, Max scalable VF=8, unsafe to vectorize. Should clamp to 8.
; CHECK-DBG-LABEL: LV: Checking a loop in "test6"
-; CHECK-DBG: LV: The max safe scalable VF is: vscale x 8.
-; CHECK-DBG: LV: User VF=vscale x 16 is unsafe. Ignoring scalable UserVF.
-; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe. Ignoring the hint to let the compiler pick a suitable VF.
-; CHECK-DBG: LV: Found feasible scalable VF = vscale x 4
-; CHECK-DBG: Selecting VF: 4.
+; CHECK-DBG: LV: The max safe VF is: vscale x 8.
+; CHECK-DBG: LV: User VF=vscale x 16 is unsafe, clamping to max safe VF=vscale x 8.
+; CHECK-DBG: remark: <unknown>:0:0: User-specified vectorization factor vscale x 16 is unsafe, clamping to maximum safe vectorization factor vscale x 8
+; CHECK-DBG: LV: Using max VF vscale x 8
; CHECK-LABEL: @test6
-; CHECK: <4 x i32>
+; CHECK: <vscale x 8 x i32>
define void @test6(i32* %a, i32* %b) {
entry:
br label %loop
@@ -310,9 +304,8 @@ exit:
!17 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
; CHECK-NO-SVE-LABEL: LV: Checking a loop in "test_no_sve"
-; CHECK-NO-SVE: LV: Disabling scalable vectorization, because target does not support scalable vectors.
-; CHECK-NO-SVE: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
-; CHECK-NO-SVE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
+; CHECK-NO-SVE: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK-NO-SVE: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
; CHECK-NO-SVE: LV: Selecting VF: 4.
; CHECK-NO-SVE: <4 x i32>
; CHECK-NO-SVE-NOT: <vscale x 4 x i32>
@@ -344,8 +337,8 @@ exit:
; supported but max vscale is undefined.
;
; CHECK-NO-MAX-VSCALE-LABEL: LV: Checking a loop in "test_no_max_vscale"
-; CEHCK-NO-MAX-VSCALE: The max safe fixed VF is: 4.
-; CHECK-NO-MAX-VSCALE: LV: User VF=vscale x 4 is unsafe. Ignoring scalable UserVF.
+; CHECK-NO-MAX-VSCALE: LV: Max legal vector width too small, scalable vectorization unfeasible. Using fixed-width vectorization instead.
+; CHECK-NO-MAX-VSCALE: The max safe VF is: 4.
; CHECK-NO-MAX-VSCALE: LV: Selecting VF: 4.
; CHECK-NO-MAX-VSCALE: <4 x i32>
define void @test_no_max_vscale(i32* %a, i32* %b) {
diff --git a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
index f99d87f8ad795..1361ba59bca2d 100644
--- a/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
+++ b/llvm/test/Transforms/LoopVectorize/scalable-vf-hint.ll
@@ -3,8 +3,8 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; CHECK: LV: Disabling scalable vectorization, because target does not support scalable vectors.
-; CHECK: remark: <unknown>:0:0: Disabling scalable vectorization, because target does not support scalable vectors.
+; CHECK: LV: Ignoring VF=vscale x 4 because target does not support scalable vectors.
+; CHECK: remark: <unknown>:0:0: Ignoring VF=vscale x 4 because target does not support scalable vectors.
; CHECK: LV: The Widest register safe to use is: 32 bits.
define void @test1(i32* %a, i32* %b) {
entry:
More information about the llvm-commits
mailing list