[llvm] r207512 - [OPENMP][LV][D3423] Respect Hints.Force meta-data for loops in LoopVectorizer
Kostya Serebryany
kcc at google.com
Tue Apr 29 02:33:26 PDT 2014
This causes a build breakage in release mode:
lib/Transforms/Vectorize/LoopVectorize.cpp:5099:15: error: unused variable
'ScalarCost' [-Werror,-Wunused-variable]
const float ScalarCost = Cost;
(The constant is referenced only inside a DEBUG() statement, which compiles to nothing in Release mode, so the variable ends up unused there.)
On Tue, Apr 29, 2014 at 12:55 PM, Zinovy Nis <zinovy.nis at gmail.com> wrote:
> Author: zinovy.nis
> Date: Tue Apr 29 03:55:11 2014
> New Revision: 207512
>
> URL: http://llvm.org/viewvc/llvm-project?rev=207512&view=rev
> Log:
> [OPENMP][LV][D3423] Respect Hints.Force meta-data for loops in
> LoopVectorizer
>
> Added:
> llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
> llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
> Modified:
> llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=207512&r1=207511&r2=207512&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Tue Apr 29
> 03:55:11 2014
> @@ -819,7 +819,8 @@ public:
> /// then this vectorization factor will be selected if vectorization is
> /// possible.
> VectorizationFactor selectVectorizationFactor(bool OptForSize,
> - unsigned UserVF);
> + unsigned UserVF,
> + bool ForceVectorization);
>
> /// \return The size (in bits) of the widest type in the code that
> /// needs to be vectorized. We ignore values that remain scalar such as
> @@ -891,13 +892,17 @@ struct LoopVectorizeHints {
> unsigned Width;
> /// Vectorization unroll factor.
> unsigned Unroll;
> - /// Vectorization forced (-1 not selected, 0 force disabled, 1 force
> enabled)
> - int Force;
> + /// Vectorization forced
> + enum ForceKind {
> + FK_Undefined = -1, ///< Not selected.
> + FK_Disabled = 0, ///< Forcing disabled.
> + FK_Enabled = 1, ///< Forcing enabled.
> + } Force;
>
> LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
> : Width(VectorizationFactor)
> , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
> - , Force(-1)
> + , Force(FK_Undefined)
> , LoopID(L->getLoopID()) {
> getHints(L);
> // The command line options override any loop metadata except for when
> @@ -1010,7 +1015,8 @@ private:
> DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
> } else if (Hint == "enable") {
> if (C->getBitWidth() == 1)
> - Force = Val;
> + Force = Val == 1 ? LoopVectorizeHints::FK_Enabled
> + : LoopVectorizeHints::FK_Disabled;
> else
> DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
> } else {
> @@ -1106,18 +1112,20 @@ struct LoopVectorize : public FunctionPa
> LoopVectorizeHints Hints(L, DisableUnrolling);
>
> DEBUG(dbgs() << "LV: Loop hints:"
> - << " force=" << (Hints.Force == 0
> - ? "disabled"
> - : (Hints.Force == 1 ? "enabled" :
> "?"))
> - << " width=" << Hints.Width << " unroll=" << Hints.Unroll
> - << "\n");
> + << " force="
> + << (Hints.Force == LoopVectorizeHints::FK_Disabled
> + ? "disabled"
> + : (Hints.Force == LoopVectorizeHints::FK_Enabled
> + ? "enabled"
> + : "?")) << " width=" << Hints.Width
> + << " unroll=" << Hints.Unroll << "\n");
>
> - if (Hints.Force == 0) {
> + if (Hints.Force == LoopVectorizeHints::FK_Disabled) {
> DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize
> disable.\n");
> return false;
> }
>
> - if (!AlwaysVectorize && Hints.Force != 1) {
> + if (!AlwaysVectorize && Hints.Force !=
> LoopVectorizeHints::FK_Enabled) {
> DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize
> enable.\n");
> return false;
> }
> @@ -1127,6 +1135,21 @@ struct LoopVectorize : public FunctionPa
> return false;
> }
>
> + // Check the loop for a trip count threshold:
> + // do not vectorize loops with a tiny trip count.
> + BasicBlock *Latch = L->getLoopLatch();
> + const unsigned TC = SE->getSmallConstantTripCount(L, Latch);
> + if (TC > 0u && TC < TinyTripCountVectorThreshold) {
> + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
> + << "This loop is not worth vectorizing.");
> + if (Hints.Force == LoopVectorizeHints::FK_Enabled)
> + DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
> + else {
> + DEBUG(dbgs() << "\n");
> + return false;
> + }
> + }
> +
> // Check if it is legal to vectorize the loop.
> LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
> if (!LVL.canVectorize()) {
> @@ -1140,8 +1163,8 @@ struct LoopVectorize : public FunctionPa
> // Check the function attributes to find out if this function should
> be
> // optimized for size.
> Function *F = L->getHeader()->getParent();
> - bool OptForSize =
> - Hints.Force != 1 && F->hasFnAttribute(Attribute::OptimizeForSize);
> + bool OptForSize = Hints.Force != LoopVectorizeHints::FK_Enabled &&
> + F->hasFnAttribute(Attribute::OptimizeForSize);
>
> // Compute the weighted frequency of this loop being executed and see
> if it
> // is less than 20% of the function entry baseline frequency. Note
> that we
> @@ -1150,7 +1173,8 @@ struct LoopVectorize : public FunctionPa
> // exactly what block frequency models.
> if (LoopVectorizeWithBlockFrequency) {
> BlockFrequency LoopEntryFreq =
> BFI->getBlockFreq(L->getLoopPreheader());
> - if (Hints.Force != 1 && LoopEntryFreq < ColdEntryFreq)
> + if (Hints.Force != LoopVectorizeHints::FK_Enabled &&
> + LoopEntryFreq < ColdEntryFreq)
> OptForSize = true;
> }
>
> @@ -1166,7 +1190,10 @@ struct LoopVectorize : public FunctionPa
>
> // Select the optimal vectorization factor.
> const LoopVectorizationCostModel::VectorizationFactor VF =
> - CM.selectVectorizationFactor(OptForSize,
> Hints.Width);
> + CM.selectVectorizationFactor(OptForSize, Hints.Width,
> + Hints.Force ==
> + LoopVectorizeHints::FK_Enabled);
> +
> // Select the unroll factor.
> const unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll,
> VF.Width,
> VF.Cost);
> @@ -3300,15 +3327,6 @@ bool LoopVectorizationLegality::canVecto
> return false;
> }
>
> - // Do not loop-vectorize loops with a tiny trip count.
> - BasicBlock *Latch = TheLoop->getLoopLatch();
> - unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
> - if (TC > 0u && TC < TinyTripCountVectorThreshold) {
> - DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
> - "This loop is not worth vectorizing.\n");
> - return false;
> - }
> -
> // Check if we can vectorize the instructions and CFG in this loop.
> if (!canVectorizeInstrs()) {
> DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
> @@ -5007,7 +5025,8 @@ bool LoopVectorizationLegality::blockCan
>
> LoopVectorizationCostModel::VectorizationFactor
> LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
> - unsigned UserVF) {
> + unsigned UserVF,
> + bool
> ForceVectorization) {
> // Width 1 means no vectorize
> VectorizationFactor Factor = { 1U, 0U };
> if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
> @@ -5077,8 +5096,16 @@ LoopVectorizationCostModel::selectVector
> }
>
> float Cost = expectedCost(1);
> + const float ScalarCost = Cost;
> unsigned Width = 1;
> DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
> +
> + // Ignore scalar width, because the user explicitly wants vectorization.
> + if (ForceVectorization && VF > 1) {
> + Width = 2;
> + Cost = expectedCost(Width) / (float)Width;
> + }
> +
> for (unsigned i=2; i <= VF; i*=2) {
> // Notice that the vector loop needs to be executed less times, so
> // we need to divide the cost of the vector loops by the width of
> @@ -5092,6 +5119,9 @@ LoopVectorizationCostModel::selectVector
> }
> }
>
> + DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
> + << "LV: Vectorization seems to be not beneficial, "
> + << "but was forced by a user.\n");
> DEBUG(dbgs() << "LV: Selecting VF: "<< Width << ".\n");
> Factor.Width = Width;
> Factor.Cost = Width * Cost;
>
> Added: llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll?rev=207512&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll (added)
> +++ llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.ll Tue Apr
> 29 03:55:11 2014
> @@ -0,0 +1,93 @@
> +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0
> -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S 2>&1 | FileCheck %s
> +; REQUIRES: asserts
> +
> +; CHECK: LV: Loop hints: force=enabled
> +; CHECK: LV: Loop hints: force=?
> +; No more loops in the module
> +; CHECK-NOT: LV: Loop hints: force=
> +; CHECK: 2 loop-vectorize - Number of loops analyzed for
> vectorization
> +; CHECK: 1 loop-vectorize - Number of loops vectorized
> +
> +target datalayout =
> "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-apple-macosx10.8.0"
> +
> +;
> +; The source code for the test:
> +;
> +; #include <math.h>
> +; void foo(float* restrict A, float * restrict B, int size)
> +; {
> +; for (int i = 0; i < size; ++i) A[i] = sinf(B[i]);
> +; }
> +;
> +
> +;
> +; This loop will be vectorized, although the scalar cost is lower than
> any of vector costs, but vectorization is explicitly forced in metadata.
> +;
> +
> +define void @vectorized(float* noalias nocapture %A, float* noalias
> nocapture %B, i32 %size) {
> +entry:
> + %cmp6 = icmp sgt i32 %size, 0
> + br i1 %cmp6, label %for.body.preheader, label %for.end
> +
> +for.body.preheader:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0,
> %for.body.preheader ]
> + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
> + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
> + %call = tail call float @llvm.sin.f32(float %0)
> + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
> + store float %call, float* %arrayidx2, align 4,
> !llvm.mem.parallel_loop_access !1
> + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
> + %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> + %exitcond = icmp eq i32 %lftr.wideiv, %size
> + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !1
> +
> +for.end.loopexit:
> + br label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +!1 = metadata !{metadata !1, metadata !2}
> +!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
> +
> +;
> +; This method will not be vectorized, as scalar cost is lower than any of
> vector costs.
> +;
> +
> +define void @not_vectorized(float* noalias nocapture %A, float* noalias
> nocapture %B, i32 %size) {
> +entry:
> + %cmp6 = icmp sgt i32 %size, 0
> + br i1 %cmp6, label %for.body.preheader, label %for.end
> +
> +for.body.preheader:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0,
> %for.body.preheader ]
> + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
> + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
> + %call = tail call float @llvm.sin.f32(float %0)
> + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
> + store float %call, float* %arrayidx2, align 4,
> !llvm.mem.parallel_loop_access !3
> + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
> + %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> + %exitcond = icmp eq i32 %lftr.wideiv, %size
> + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !3
> +
> +for.end.loopexit:
> + br label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +declare float @llvm.sin.f32(float) nounwind readnone
> +
> +; Dummy metadata
> +!3 = metadata !{metadata !3}
> +
>
> Added:
> llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll?rev=207512&view=auto
>
> ==============================================================================
> ---
> llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
> (added)
> +++
> llvm/trunk/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll Tue
> Apr 29 03:55:11 2014
> @@ -0,0 +1,73 @@
> +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0
> -mcpu=corei7-avx -debug-only=loop-vectorize -stats -S
> -vectorizer-min-trip-count=21 2>&1 | FileCheck %s
> +; REQUIRES: asserts
> +
> +; CHECK: LV: Loop hints: force=enabled
> +; CHECK: LV: Loop hints: force=?
> +; No more loops in the module
> +; CHECK-NOT: LV: Loop hints: force=
> +; CHECK: 2 loop-vectorize - Number of loops analyzed for
> vectorization
> +; CHECK: 1 loop-vectorize - Number of loops vectorized
> +
> +target datalayout =
> "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-apple-macosx10.8.0"
> +
> +;
> +; The source code for the test:
> +;
> +; void foo(float* restrict A, float* restrict B)
> +; {
> +; for (int i = 0; i < 20; ++i) A[i] += B[i];
> +; }
> +;
> +
> +;
> +; This loop will be vectorized, although the trip count is below the
> threshold, but vectorization is explicitly forced in metadata.
> +;
> +define void @vectorized(float* noalias nocapture %A, float* noalias
> nocapture readonly %B) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
> + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
> + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
> + %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
> + %add = fadd fast float %0, %1
> + store float %add, float* %arrayidx2, align 4,
> !llvm.mem.parallel_loop_access !1
> + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
> + %exitcond = icmp eq i64 %indvars.iv.next, 20
> + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
> +
> +for.end:
> + ret void
> +}
> +
> +!1 = metadata !{metadata !1, metadata !2}
> +!2 = metadata !{metadata !"llvm.vectorizer.enable", i1 true}
> +
> +;
> +; This loop will not be vectorized as the trip count is below the
> threshold.
> +;
> +define void @not_vectorized(float* noalias nocapture %A, float* noalias
> nocapture readonly %B) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> + %arrayidx = getelementptr inbounds float* %B, i64 %indvars.iv
> + %0 = load float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !3
> + %arrayidx2 = getelementptr inbounds float* %A, i64 %indvars.iv
> + %1 = load float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !3
> + %add = fadd fast float %0, %1
> + store float %add, float* %arrayidx2, align 4,
> !llvm.mem.parallel_loop_access !3
> + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
> + %exitcond = icmp eq i64 %indvars.iv.next, 20
> + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !3
> +
> +for.end:
> + ret void
> +}
> +
> +!3 = metadata !{metadata !3}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140429/564b5e84/attachment.html>
More information about the llvm-commits
mailing list