[llvm] d2e4ccc - [LV] Ignore candidate VFs with invalid costs.
Sander de Smalen via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 12 01:58:56 PDT 2021
Author: Sander de Smalen
Date: 2021-07-12T09:58:22+01:00
New Revision: d2e4ccc79023fcc5fb87ec6873a7856250e55eaa
URL: https://github.com/llvm/llvm-project/commit/d2e4ccc79023fcc5fb87ec6873a7856250e55eaa
DIFF: https://github.com/llvm/llvm-project/commit/d2e4ccc79023fcc5fb87ec6873a7856250e55eaa.diff
LOG: [LV] Ignore candidate VFs with invalid costs.
This follows on from discussion on the mailing-list:
https://lists.llvm.org/pipermail/llvm-dev/2021-June/151047.html
to interpret an Invalid cost as 'infinitely expensive', as this
simplifies some of the legalization issues with scalable vectors.
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D105473
Added:
Modified:
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 94f6aad5d33c6..71f16a86a0bf3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1261,9 +1261,11 @@ class LoopVectorizationCostModel {
const LoopVectorizationPlanner &LVP);
/// Setup cost-based decisions for user vectorization factor.
- void selectUserVectorizationFactor(ElementCount UserVF) {
+ /// \return true if the UserVF is a feasible VF to be chosen.
+ bool selectUserVectorizationFactor(ElementCount UserVF) {
collectUniformsAndScalars(UserVF);
collectInstsToScalarize(UserVF);
+ return expectedCost(UserVF).first.isValid();
}
/// \return The size (in bits) of the smallest and widest types in the code
@@ -5725,8 +5727,14 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
auto MaxSafeUserVF =
UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
- if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
- return UserVF;
+ if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
+ // If `VF=vscale x N` is safe, then so is `VF=N`
+ if (UserVF.isScalable())
+ return FixedScalableVFPair(
+ ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
+ else
+ return UserVF;
+ }
assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
@@ -6072,17 +6080,11 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
if (i.isScalar())
continue;
- // Notice that the vector loop needs to be executed less times, so
- // we need to divide the cost of the vector loops by the width of
- // the vector elements.
VectorizationCostTy C = expectedCost(i);
-
- assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
VectorizationFactor Candidate(i, C.first);
LLVM_DEBUG(
dbgs() << "LV: Vector loop of width " << i << " costs: "
- << (*Candidate.Cost.getValue() /
- Candidate.Width.getKnownMinValue())
+ << (Candidate.Cost / Candidate.Width.getKnownMinValue())
<< (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
<< ".\n");
@@ -6109,8 +6111,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
}
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
- *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
- dbgs()
+ ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
@@ -6438,8 +6439,9 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
- assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
- LoopCost = *expectedCost(VF).first.getValue();
+ InstructionCost C = expectedCost(VF).first;
+ assert(C.isValid() && "Expected to have chosen a VF with valid cost");
+ LoopCost = *C.getValue();
}
assert(LoopCost && "Non-zero loop cost expected");
@@ -7295,6 +7297,8 @@ InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) const {
+ // There is no mechanism yet to create a scalable scalarization loop,
+ // so this is currently Invalid.
if (VF.isScalable())
return InstructionCost::getInvalid();
@@ -8013,17 +8017,19 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
if (!UserVF.isZero() && UserVFIsLegal) {
- LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
- << " VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
- CM.selectUserVectorizationFactor(UserVF);
- CM.collectInLoopReductions();
- buildVPlansWithVPRecipes(UserVF, UserVF);
- LLVM_DEBUG(printPlans(dbgs()));
- return {{UserVF, 0}};
+ if (CM.selectUserVectorizationFactor(UserVF)) {
+ LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+ CM.collectInLoopReductions();
+ buildVPlansWithVPRecipes(UserVF, UserVF);
+ LLVM_DEBUG(printPlans(dbgs()));
+ return {{UserVF, 0}};
+ } else
+ reportVectorizationInfo("UserVF ignored because of invalid costs.",
+ "InvalidCost", ORE, OrigLoop);
}
// Populate the set of Vectorization Factor Candidates.
@@ -8798,8 +8804,6 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
- "Either the intrinsic cost or vector call cost must be valid");
return UseVectorIntrinsic || !NeedToScalarize;
};
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index 2bbc6df24dc17..767aac9a31abd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -75,7 +75,7 @@ define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
; CHECK-LABEL: @vec_intrinsic
; CHECK: vector.body:
; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
@@ -95,17 +95,90 @@ for.end:
ret void
}
+define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_no_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %1 = tail call fast float @llvm.sin.f32(float %0)
+ %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+ store float %1, float* %arrayidx1, align 4
+ %inc = add nuw nsw i64 %i.07, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+}
+
+define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_fixed_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %1 = tail call fast float @llvm.sin.f32(float %0) #3
+ %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+ store float %1, float* %arrayidx1, align 4
+ %inc = add nuw nsw i64 %i.07, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+}
+
+; Even though there are no function mappings attached to the call
+; in the loop below we can still vectorize the loop because SVE has
+; hardware support in the form of the 'fsqrt' instruction.
+define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
+; CHECK: @vec_sqrt_no_mapping
+; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float, float* %src, i64 %i.07
+ %0 = load float, float* %arrayidx, align 4
+ %1 = tail call fast float @llvm.sqrt.f32(float %0)
+ %arrayidx1 = getelementptr inbounds float, float* %dst, i64 %i.07
+ store float %1, float* %arrayidx1, align 4
+ %inc = add nuw nsw i64 %i.07, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !1
+
+for.cond.cleanup: ; preds = %for.body
+ ret void
+}
+
+
declare double @foo(double)
declare i64 @bar(i64*)
declare double @llvm.sin.f64(double)
+declare float @llvm.sin.f32(float)
+declare float @llvm.sqrt.f32(float)
declare <vscale x 2 x double> @foo_vec(<vscale x 2 x double>)
declare <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*>)
-declare <vscale x 2 x double> @sin_vec(<vscale x 2 x double>)
+declare <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double>)
+declare <2 x double> @sin_vec_v2f64(<2 x double>)
attributes #0 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_foo(foo_vec)" }
attributes #1 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_bar(bar_vec)" }
-attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec)" }
+attributes #2 = { "vector-function-abi-variant"="_ZGV_LLVM_Nxv_llvm.sin.f64(sin_vec_nxv2f64)" }
+attributes #3 = { "vector-function-abi-variant"="_ZGV_LLVM_N2v_llvm.sin.f64(sin_vec_v2f64)" }
!1 = distinct !{!1, !2, !3}
!2 = !{!"llvm.loop.vectorize.width", i32 2}
More information about the llvm-commits mailing list