[llvm] r263509 - [LoopUnroll] Respect the convergent attribute.
Justin Lebar via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 14 16:15:34 PDT 2016
Author: jlebar
Date: Mon Mar 14 18:15:34 2016
New Revision: 263509
URL: http://llvm.org/viewvc/llvm-project?rev=263509&view=rev
Log:
[LoopUnroll] Respect the convergent attribute.
Summary:
Specifically, when we perform runtime loop unrolling of a loop that
contains a convergent op, we can only unroll k times, where k divides
the loop trip multiple.
Without this change, we'll happily unroll e.g. the following loop
for (int i = 0; i < N; ++i) {
if (i == 0) convergent_op();
foo();
}
into
int i = 0;
if (N % 2 == 1) {
convergent_op();
foo();
++i;
}
for (; i < N - 1; i += 2) {
if (i == 0) convergent_op();
foo();
foo();
}.
This is unsafe, because we've just added a control-flow dependency to
the convergent op in the prelude.
In general, runtime unrolling loops that contain convergent ops is safe
only if we don't have emit a prelude, which occurs when the unroll count
divides the trip multiple.
Reviewers: resistor
Subscribers: llvm-commits, mzolotukhin
Differential Revision: http://reviews.llvm.org/D17526
Added:
llvm/trunk/test/Transforms/LoopUnroll/convergent.ll
Modified:
llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp
llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp
Modified: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp?rev=263509&r1=263508&r2=263509&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp Mon Mar 14 18:15:34 2016
@@ -362,7 +362,7 @@ analyzeLoopUnrollCost(const Loop *L, uns
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- bool &NotDuplicatable,
+ bool &NotDuplicatable, bool &Convergent,
const TargetTransformInfo &TTI,
AssumptionCache *AC) {
SmallPtrSet<const Value *, 32> EphValues;
@@ -373,6 +373,7 @@ static unsigned ApproximateLoopSize(cons
Metrics.analyzeBasicBlock(BB, TTI, EphValues);
NumCalls = Metrics.NumInlineCandidates;
NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
unsigned LoopSize = Metrics.NumInsts;
@@ -568,8 +569,9 @@ static bool tryToUnrollLoop(Loop *L, Dom
unsigned NumInlineCandidates;
bool NotDuplicatable;
- unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, TTI, &AC);
+ bool Convergent;
+ unsigned LoopSize = ApproximateLoopSize(
+ L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
// When computing the unrolled size, note that the conditional branch on the
@@ -623,6 +625,7 @@ static bool tryToUnrollLoop(Loop *L, Dom
if (HasRuntimeUnrollDisablePragma(L) || PragmaFullUnroll) {
AllowRuntime = false;
}
+ bool DecreasedCountDueToConvergence = false;
if (Unrolling == Partial) {
bool AllowPartial = PragmaEnableUnroll || UP.Partial;
if (!AllowPartial && !CountSetExplicitly) {
@@ -643,14 +646,40 @@ static bool tryToUnrollLoop(Loop *L, Dom
<< "-unroll-runtime not given\n");
return false;
}
+
// Reduce unroll count to be the largest power-of-two factor of
// the original count which satisfies the threshold limit.
while (Count != 0 && UnrolledSize > UP.PartialThreshold) {
Count >>= 1;
UnrolledSize = (LoopSize-2) * Count + 2;
}
+
if (Count > UP.MaxCount)
Count = UP.MaxCount;
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
+ // operation. Therefore Count must divide TripMultiple.
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ unsigned OrigCount = Count;
+ while (Convergent && Count != 0 && TripMultiple % Count != 0) {
+ DecreasedCountDueToConvergence = true;
+ Count >>= 1;
+ }
+ if (OrigCount > Count) {
+ DEBUG(dbgs() << " loop contains a convergent instruction, so unroll "
+ "count must divide the trip multiple, "
+ << TripMultiple << ". Reducing unroll count from "
+ << OrigCount << " to " << Count << ".\n");
+ }
DEBUG(dbgs() << " partially unrolling with count: " << Count << "\n");
}
@@ -665,7 +694,16 @@ static bool tryToUnrollLoop(Loop *L, Dom
DebugLoc LoopLoc = L->getStartLoc();
Function *F = Header->getParent();
LLVMContext &Ctx = F->getContext();
- if ((PragmaCount > 0) && Count != OriginalCount) {
+ if (PragmaCount > 0 && DecreasedCountDueToConvergence) {
+ emitOptimizationRemarkMissed(
+ Ctx, DEBUG_TYPE, *F, LoopLoc,
+ Twine("Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because the loop contains a convergent "
+ "instruction, and so must have an unroll count that divides "
+ "the loop trip multiple of ") +
+ Twine(TripMultiple) + ". Unrolling instead " + Twine(Count) +
+ " time(s).");
+ } else if ((PragmaCount > 0) && Count != OriginalCount) {
emitOptimizationRemarkMissed(
Ctx, DEBUG_TYPE, *F, LoopLoc,
"Unable to unroll loop the number of times directed by "
Modified: llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp?rev=263509&r1=263508&r2=263509&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp Mon Mar 14 18:15:34 2016
@@ -273,7 +273,23 @@ bool llvm::UnrollLoop(Loop *L, unsigned
// flag is specified.
bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
- if (RuntimeTripCount &&
+ // Loops containing convergent instructions must have a count that divides
+ // their TripMultiple.
+ DEBUG(
+ bool HasConvergent = false;
+ for (auto &BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto CS = CallSite(&I))
+ HasConvergent |= CS.isConvergent();
+ assert(
+ !HasConvergent || TripMultiple % Count == 0 &&
+ "Unroll count must divide trip multiple if loop contains a convergent "
+ "operation.");
+ );
+ // Don't output the runtime loop prolog if Count is a multiple of
+ // TripMultiple. Such a prolog is never needed, and is unsafe if the loop
+ // contains a convergent instruction.
+ if (RuntimeTripCount && TripMultiple % Count != 0 &&
!UnrollRuntimeLoopProlog(L, Count, AllowExpensiveTripCount, LI, SE, DT,
PreserveLCSSA))
return false;
Added: llvm/trunk/test/Transforms/LoopUnroll/convergent.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/convergent.ll?rev=263509&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnroll/convergent.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnroll/convergent.ll Mon Mar 14 18:15:34 2016
@@ -0,0 +1,83 @@
+; RUN: opt < %s -loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s
+
+declare void @f() convergent
+
+; Although this loop contains a convergent instruction, it should be
+; fully unrolled.
+;
+; CHECK-LABEL: @full_unroll(
+define i32 @full_unroll() {
+entry:
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK-NOT: call void @f()
+ call void @f() ;convergent
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 3
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction, but it should be partially
+; unrolled. The unroll count is the largest power of 2 that divides the
+; multiple -- 4, in this case.
+;
+; CHECK-LABEL: @runtime_unroll(
+define i32 @runtime_unroll(i32 %n) {
+entry:
+ %loop_ctl = mul nsw i32 %n, 12
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK-NOT: call void @f()
+ call void @f() convergent
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %loop_ctl
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction, so its partial unroll
+; count must divide its trip multiple. This overrides its unroll
+; pragma -- we unroll exactly 8 times, even though 16 is requested.
+; CHECK-LABEL: @pragma_unroll
+define i32 @pragma_unroll(i32 %n) {
+entry:
+ %loop_ctl = mul nsw i32 %n, 24
+ br label %l3, !llvm.loop !0
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK: call void @f()
+; CHECK-NOT: call void @f()
+ call void @f() convergent
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %loop_ctl
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+ ret i32 0
+}
+
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
More information about the llvm-commits
mailing list