[llvm-branch-commits] [llvm] cba4acc - [LV] Clamp VF hint when unsafe
Cullen Rhodes via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Dec 1 03:34:44 PST 2020
Author: Cullen Rhodes
Date: 2020-12-01T11:30:34Z
New Revision: cba4accda08f90bbc96d7662ef6b1bb12a7733f2
URL: https://github.com/llvm/llvm-project/commit/cba4accda08f90bbc96d7662ef6b1bb12a7733f2
DIFF: https://github.com/llvm/llvm-project/commit/cba4accda08f90bbc96d7662ef6b1bb12a7733f2.diff
LOG: [LV] Clamp VF hint when unsafe
In the following loop the dependence distance is 2, so the loop can only
be vectorized if the vector length is no larger than this distance.
void foo(int *a, int *b, int N) {
#pragma clang loop vectorize(enable) vectorize_width(4)
for (int i=0; i<N; ++i) {
a[i + 2] = a[i] + b[i];
}
}
However, when specifying a VF of 4 via a loop hint this loop is
vectorized. According to [1][2], loop hints are ignored if the
optimization is not safe to apply.
This patch introduces a check that clamps the user-specified VF to the
maximum safe VF (emitting an optimization remark) if it is greater than
that value, unless explicitly forced with '-force-vector-width=X'.
[1] https://llvm.org/docs/LangRef.html#llvm-loop-vectorize-and-llvm-loop-interleave
[2] https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations
Reviewed By: sdesmalen, fhahn, Meinersbur
Differential Revision: https://reviews.llvm.org/D90687
Added:
llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d389e03e9c04..d87938bb1464 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1443,7 +1443,8 @@ class LoopVectorizationCostModel {
/// \return An upper bound for the vectorization factor, a power-of-2 larger
/// than zero. One is returned if vectorization should best be avoided due
/// to cost.
- ElementCount computeFeasibleMaxVF(unsigned ConstTripCount);
+ ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF);
/// The vectorization cost is a combination of the cost itself and a boolean
/// indicating whether any of the contributing operations will actually
@@ -5270,9 +5271,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
+ ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return UserVF ? UserVF : computeFeasibleMaxVF(TC);
+ return MaxVF;
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -5308,7 +5311,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
assert(!MaxVF.isScalable() &&
"Scalable vectors do not yet support tail folding");
assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
@@ -5361,7 +5363,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+ ElementCount UserVF) {
+ assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -5373,6 +5377,27 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
// dependence distance).
unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
+ if (UserVF.isNonZero()) {
+ // If legally unsafe, clamp the user vectorization factor to a safe value.
+ unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+ if (UserVF.getFixedValue() <= MaxSafeVF)
+ return UserVF;
+
+ LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+ << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+ << ".\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "User-specified vectorization factor "
+ << ore::NV("UserVectorizationFactor", UserVF)
+ << " is unsafe, clamping to maximum safe vectorization factor "
+ << ore::NV("VectorizationFactor", MaxSafeVF);
+ });
+ return ElementCount::getFixed(MaxSafeVF);
+ }
+
WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
// Ensure MaxVF is a power of 2; the dependence distance bound may not be.
@@ -7031,9 +7056,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
- if (!UserVF.isZero()) {
+ ElementCount MaxVF = MaybeMaxVF.getValue();
+ assert(MaxVF.isNonZero() && "MaxVF is zero.");
+
+ if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
- assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+ assert(isPowerOf2_32(UserVF.getFixedValue()) &&
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
@@ -7044,9 +7072,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
return {{UserVF, 0}};
}
- ElementCount MaxVF = MaybeMaxVF.getValue();
- assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
for (ElementCount VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
// Collect Uniform and Scalar instructions after vectorization with VF.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
new file mode 100644
index 000000000000..44293441e492
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
@@ -0,0 +1,43 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Specify a large unsafe vectorization factor of 32 that gets clamped to 16,
+; then test an even smaller VF of 2 is selected based on the cost-model.
+
+; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
+; CHECK: LV: Selecting VF: 2.
+; CHECK-LABEL: @test
+; CHECK: <2 x i64>
+define void @test(i64* nocapture %a, i64* nocapture readonly %b) {
+entry:
+ br label %loop.header
+
+loop.header:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
+ %0 = load i64, i64* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 %iv
+ %1 = load i64, i64* %arrayidx2, align 4
+ %add = add nsw i64 %1, %0
+ %2 = add nuw nsw i64 %iv, 16
+ %arrayidx5 = getelementptr inbounds i64, i64* %a, i64 %2
+ %c = icmp eq i64 %1, 120
+ br i1 %c, label %then, label %latch
+
+then:
+ store i64 %add, i64* %arrayidx5, align 4
+ br label %latch
+
+latch:
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i64 32}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll b/llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
new file mode 100644
index 000000000000..a6b0eec230bd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
@@ -0,0 +1,46 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure the unsafe user specified vectorization factor is clamped.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; void foo(int *a, int *b) {
+; #pragma clang loop vectorize(enable) vectorize_width(4)
+; for (int i=0; i < 1024; ++i) {
+; a[i + 2] = a[i] + b[i];
+; }
+; }
+
+; CHECK: LV: User VF=4 is unsafe, clamping to max safe VF=2.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 4 is unsafe, clamping to maximum safe vectorization factor 2
+; CHECK-LABEL: @foo
+; CHECK: <2 x i32>
+define void @foo(i32* %a, i32* %b) {
+entry:
+ br label %loop.ph
+
+loop.ph:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+ %1 = load i32, i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %2 = add nuw nsw i64 %iv, 2
+ %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+ store i32 %add, i32* %arrayidx5, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, 1024
+ br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+ ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
More information about the llvm-branch-commits
mailing list