[llvm] e0ac087 - [LoopUnroll] Consider convergence control tokens when unrolling (#91715)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 6 00:43:50 PDT 2024
Author: Sameer Sahasrabuddhe
Date: 2024-06-06T13:13:46+05:30
New Revision: e0ac087ff004f7a63ba64b9685f4f098d6ee54c5
URL: https://github.com/llvm/llvm-project/commit/e0ac087ff004f7a63ba64b9685f4f098d6ee54c5
DIFF: https://github.com/llvm/llvm-project/commit/e0ac087ff004f7a63ba64b9685f4f098d6ee54c5.diff
LOG: [LoopUnroll] Consider convergence control tokens when unrolling (#91715)
- There is no restriction on a loop with controlled convergent operations when
  the relevant tokens are defined and used within the loop.
- When a token defined outside a loop is used inside (also called a loop
  convergence heart), unrolling is allowed only in the absence of remainder or
  runtime checks.
- When a token defined inside a loop is used outside, such a loop is said to be
  "extended". This loop can only be unrolled by also duplicating the extended
  part lying outside the loop. Such unrolling is disabled for now.
- Clean up loop hearts: When unrolling a loop with a heart, duplicating the
  heart will introduce multiple static uses of a convergence control token in a
  cycle that does not contain its definition. This violates the static rules for
  tokens, and needs to be cleaned up into a single occurrence of the intrinsic.
- Spell out the initializer for UnrollLoopOptions to improve readability.
Original implementation [D85605] by Nicolai Haehnle <nicolai.haehnle at amd.com>.
Added:
llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
Modified:
llvm/include/llvm/Analysis/CodeMetrics.h
llvm/include/llvm/Analysis/LoopInfo.h
llvm/include/llvm/IR/InstrTypes.h
llvm/include/llvm/IR/IntrinsicInst.h
llvm/include/llvm/Transforms/Utils/UnrollLoop.h
llvm/lib/Analysis/CodeMetrics.cpp
llvm/lib/Analysis/LoopInfo.cpp
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
llvm/lib/Transforms/Utils/LoopUnroll.cpp
llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h
index a9431bca11251..d09018daf9548 100644
--- a/llvm/include/llvm/Analysis/CodeMetrics.h
+++ b/llvm/include/llvm/Analysis/CodeMetrics.h
@@ -20,12 +20,15 @@
namespace llvm {
class AssumptionCache;
class BasicBlock;
+class Instruction;
class Loop;
class Function;
template <class T> class SmallPtrSetImpl;
class TargetTransformInfo;
class Value;
+enum struct ConvergenceKind { None, Controlled, ExtendedLoop, Uncontrolled };
+
/// Utility to calculate the size and a few similar metrics for a set
/// of basic blocks.
struct CodeMetrics {
@@ -42,8 +45,8 @@ struct CodeMetrics {
/// one or more 'noduplicate' instructions.
bool notDuplicatable = false;
- /// True if this function contains a call to a convergent function.
- bool convergent = false;
+ /// The kind of convergence specified in this function.
+ ConvergenceKind Convergence = ConvergenceKind::None;
/// True if this function calls alloca (in the C sense).
bool usesDynamicAlloca = false;
@@ -77,7 +80,7 @@ struct CodeMetrics {
/// Add information about a block to the current state.
void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues,
- bool PrepareForLTO = false);
+ bool PrepareForLTO = false, const Loop *L = nullptr);
/// Collect a loop's ephemeral values (those used only by an assume
/// or similar intrinsics in the loop).
diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index 52084630560c5..4f06a7e889f91 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -649,6 +649,9 @@ int getIntLoopAttribute(const Loop *TheLoop, StringRef Name, int Default = 0);
std::optional<const MDOperand *> findStringMetadataForLoop(const Loop *TheLoop,
StringRef Name);
+/// Find the convergence heart of the loop.
+CallBase *getLoopConvergenceHeart(const Loop *TheLoop);
+
/// Look for the loop attribute that requires progress within the loop.
/// Note: Most consumers probably want "isMustProgress" which checks
/// the containing function attribute too.
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 9dd1bb455a718..441e6a1e79843 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1588,6 +1588,14 @@ class CallBase : public Instruction {
static CallBase *removeOperandBundle(CallBase *CB, uint32_t ID,
BasicBlock::iterator InsertPt);
+ /// Return the convergence control token for this call, if it exists.
+ Value *getConvergenceControlToken() const {
+ if (auto Bundle = getOperandBundle(llvm::LLVMContext::OB_convergencectrl)) {
+ return Bundle->Inputs[0].get();
+ }
+ return nullptr;
+ }
+
static bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Call ||
I->getOpcode() == Instruction::Invoke ||
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index fcd3a1025ac13..9010e1a1c896b 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1799,17 +1799,14 @@ class ConvergenceControlInst : public IntrinsicInst {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
- // Returns the convergence intrinsic referenced by |I|'s convergencectrl
- // attribute if any.
- static IntrinsicInst *getParentConvergenceToken(Instruction *I) {
- auto *CI = dyn_cast<llvm::CallInst>(I);
- if (!CI)
- return nullptr;
-
- auto Bundle = CI->getOperandBundle(llvm::LLVMContext::OB_convergencectrl);
- assert(Bundle->Inputs.size() == 1 &&
- Bundle->Inputs[0]->getType()->isTokenTy());
- return dyn_cast<llvm::IntrinsicInst>(Bundle->Inputs[0].get());
+ bool isAnchor() {
+ return getIntrinsicID() == Intrinsic::experimental_convergence_anchor;
+ }
+ bool isEntry() {
+ return getIntrinsicID() == Intrinsic::experimental_convergence_entry;
+ }
+ bool isLoop() {
+ return getIntrinsicID() == Intrinsic::experimental_convergence_loop;
}
};
diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index bd804dc112662..797c082333a76 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -16,6 +16,7 @@
#define LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Support/InstructionCost.h"
@@ -73,6 +74,7 @@ struct UnrollLoopOptions {
bool AllowExpensiveTripCount;
bool UnrollRemainder;
bool ForgetAllSCEV;
+ const Instruction *Heart = nullptr;
};
LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
@@ -128,14 +130,15 @@ class UnrollCostEstimator {
public:
unsigned NumInlineCandidates;
- bool Convergent;
+ ConvergenceKind Convergence;
+ bool ConvergenceAllowsRuntime;
UnrollCostEstimator(const Loop *L, const TargetTransformInfo &TTI,
const SmallPtrSetImpl<const Value *> &EphValues,
unsigned BEInsns);
/// Whether it is legal to unroll this loop.
- bool canUnroll() const { return LoopSize.isValid() && !NotDuplicatable; }
+ bool canUnroll() const;
uint64_t getRolledLoopSize() const { return *LoopSize.getValue(); }
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 2637e2f97dbb2..ea67b526423bf 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -16,6 +16,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/InstructionCost.h"
@@ -111,11 +112,24 @@ void CodeMetrics::collectEphemeralValues(
completeEphemeralValues(Visited, Worklist, EphValues);
}
+static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) {
+ if (!L)
+ return false;
+ if (!isa<ConvergenceControlInst>(I))
+ return false;
+ for (const auto *U : I.users()) {
+ if (!L->contains(cast<Instruction>(U)))
+ return true;
+ }
+ return false;
+}
+
/// Fill in the current structure with information gleaned from the specified
/// block.
void CodeMetrics::analyzeBasicBlock(
const BasicBlock *BB, const TargetTransformInfo &TTI,
- const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) {
+ const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO,
+ const Loop *L) {
++NumBlocks;
InstructionCost NumInstsBeforeThisBB = NumInsts;
for (const Instruction &I : *BB) {
@@ -163,19 +177,38 @@ void CodeMetrics::analyzeBasicBlock(
if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
++NumVectorInsts;
- if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+ if (I.getType()->isTokenTy() && !isa<ConvergenceControlInst>(I) &&
+ I.isUsedOutsideOfBlock(BB)) {
+ LLVM_DEBUG(dbgs() << I
+ << "\n Cannot duplicate a token value used outside "
+ "the current block (except convergence control).\n");
notDuplicatable = true;
-
- if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (CI->cannotDuplicate())
- notDuplicatable = true;
- if (CI->isConvergent())
- convergent = true;
}
- if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I))
- if (InvI->cannotDuplicate())
+ if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->cannotDuplicate())
notDuplicatable = true;
+ // Compute a meet over the visited blocks for the following partial order:
+ //
+ // None -> { Controlled, ExtendedLoop, Uncontrolled}
+ // Controlled -> ExtendedLoop
+ if (Convergence <= ConvergenceKind::Controlled && CB->isConvergent()) {
+ if (isa<ConvergenceControlInst>(CB) ||
+ CB->getConvergenceControlToken()) {
+ assert(Convergence != ConvergenceKind::Uncontrolled);
+ LLVM_DEBUG(dbgs() << "Found controlled convergence:\n" << I << "\n");
+ if (extendsConvergenceOutsideLoop(I, L))
+ Convergence = ConvergenceKind::ExtendedLoop;
+ else {
+ assert(Convergence != ConvergenceKind::ExtendedLoop);
+ Convergence = ConvergenceKind::Controlled;
+ }
+ } else {
+ assert(Convergence == ConvergenceKind::None);
+ Convergence = ConvergenceKind::Uncontrolled;
+ }
+ }
+ }
NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
}
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 369ab087ffc0f..c34c4974382ea 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -1105,6 +1105,26 @@ int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name,
return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default);
}
+CallBase *llvm::getLoopConvergenceHeart(const Loop *TheLoop) {
+ BasicBlock *H = TheLoop->getHeader();
+ for (Instruction &II : *H) {
+ if (auto *CB = dyn_cast<CallBase>(&II)) {
+ if (!CB->isConvergent())
+ continue;
+ // This is the heart if it uses a token defined outside the loop. The
+ // verifier has already checked that only the loop intrinsic can use such
+ // a token.
+ if (auto *Token = CB->getConvergenceControlToken()) {
+ auto *TokenDef = cast<Instruction>(Token);
+ if (!TheLoop->contains(TokenDef->getParent()))
+ return CB;
+ }
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
bool llvm::isFinite(const Loop *L) {
return L->getHeader()->getParent()->willReturn();
}
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2c4b45255d059..92213e19c9d9d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3961,7 +3961,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
// Loop is not unrollable if the loop contains certain instructions.
- if (!UCE.canUnroll() || UCE.Convergent) {
+ if (!UCE.canUnroll()) {
LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
return 1;
}
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index ba2546b8db0e2..4371b821eae63 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -827,7 +827,8 @@ struct TransformDFA {
return false;
}
- if (Metrics.convergent) {
+ // FIXME: Allow jump threading with controlled convergence.
+ if (Metrics.Convergence != ConvergenceKind::None) {
LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
<< "convergent instructions.\n");
ORE->emit([&]() {
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 7b4c54370e48a..f8e2f1f28088d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns);
if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
- << " which cannot be duplicated or have invalid cost.\n");
+ LLVM_DEBUG(dbgs() << " Loop not considered unrollable\n");
return LoopUnrollResult::Unmodified;
}
@@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
return LoopUnrollResult::Unmodified;
}
- if (InnerUCE.Convergent || OuterUCE.Convergent) {
+ // FIXME: The call to canUnroll() allows some controlled convergent
+ // operations, but we block them here for future changes.
+ if (InnerUCE.Convergence != ConvergenceKind::None ||
+ OuterUCE.Convergence != ConvergenceKind::None) {
LLVM_DEBUG(
dbgs() << " Not unrolling loop with convergent instructions.\n");
return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 10fc9e9303e89..cbc35b6dd4292 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -684,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator(
const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
- Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
+ L);
NumInlineCandidates = Metrics.NumInlineCandidates;
NotDuplicatable = Metrics.notDuplicatable;
- Convergent = Metrics.convergent;
+ Convergence = Metrics.Convergence;
LoopSize = Metrics.NumInsts;
+ ConvergenceAllowsRuntime =
+ Metrics.Convergence != ConvergenceKind::Uncontrolled &&
+ !getLoopConvergenceHeart(L);
// Don't allow an estimate of size zero. This would allows unrolling of loops
// with huge iteration counts, which is a compile time problem even if it's
@@ -701,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator(
LoopSize = BEInsns + 1;
}
+bool UnrollCostEstimator::canUnroll() const {
+ switch (Convergence) {
+ case ConvergenceKind::ExtendedLoop:
+ LLVM_DEBUG(dbgs() << " Convergence prevents unrolling.\n");
+ return false;
+ default:
+ break;
+ }
+ if (!LoopSize.isValid()) {
+ LLVM_DEBUG(dbgs() << " Invalid loop size prevents unrolling.\n");
+ return false;
+ }
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Non-duplicatable blocks prevent unrolling.\n");
+ return false;
+ }
+ return true;
+}
+
uint64_t UnrollCostEstimator::getUnrolledLoopSize(
const TargetTransformInfo::UnrollingPreferences &UP,
unsigned CountOverwrite) const {
@@ -1206,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
if (!UCE.canUnroll()) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions"
- << " which cannot be duplicated or have invalid cost.\n");
+ LLVM_DEBUG(dbgs() << " Loop not considered unrollable.\n");
return LoopUnrollResult::Unmodified;
}
@@ -1254,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
// is unsafe -- it adds a control-flow dependency to the convergent
// operation. Therefore restrict remainder loop (try unrolling without).
//
- // TODO: This is quite conservative. In practice, convergent_op()
- // is likely to be called unconditionally in the loop. In this
- // case, the program would be ill-formed (on most architectures)
- // unless n were the same on all threads in a thread group.
- // Assuming n is the same on all threads, any kind of unrolling is
- // safe. But currently llvm's notion of convergence isn't powerful
- // enough to express this.
- if (UCE.Convergent)
- UP.AllowRemainder = false;
+ // TODO: This is somewhat conservative; we could allow the remainder if the
+ // trip count is uniform.
+ UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
// Try to find the trip count upper bound if we cannot find the exact trip
// count.
@@ -1282,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
if (!UP.Count)
return LoopUnrollResult::Unmodified;
+ UP.Runtime &= UCE.ConvergenceAllowsRuntime;
+
if (PP.PeelCount) {
assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
@@ -1324,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
// Unroll the loop.
Loop *RemainderLoop = nullptr;
+ UnrollLoopOptions ULO;
+ ULO.Count = UP.Count;
+ ULO.Force = UP.Force;
+ ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
+ ULO.UnrollRemainder = UP.UnrollRemainder;
+ ULO.Runtime = UP.Runtime;
+ ULO.ForgetAllSCEV = ForgetAllSCEV;
+ ULO.Heart = getLoopConvergenceHeart(L);
LoopUnrollResult UnrollResult = UnrollLoop(
- L,
- {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
- UP.UnrollRemainder, ForgetAllSCEV},
- LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
+ L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
if (UnrollResult == LoopUnrollResult::Unmodified)
return LoopUnrollResult::Unmodified;
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 08ba65d9483e0..3d950b151cd32 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -460,7 +460,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
L->dump());
return Rotated;
}
- if (Metrics.convergent) {
+ if (Metrics.Convergence != ConvergenceKind::None) {
LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
"instructions: ";
L->dump());
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 1216538195fbd..90d7b99e9d817 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -419,6 +419,26 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
}
}
+// Loops containing convergent instructions that are uncontrolled or controlled
+// from outside the loop must have a count that divides their TripMultiple.
+LLVM_ATTRIBUTE_USED
+static bool canHaveUnrollRemainder(const Loop *L) {
+ if (getLoopConvergenceHeart(L))
+ return false;
+
+ // Check for uncontrolled convergent operations.
+ for (auto &BB : L->blocks()) {
+ for (auto &I : *BB) {
+ if (isa<ConvergenceControlInst>(I))
+ return true;
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->isConvergent())
+ return CB->getConvergenceControlToken();
+ }
+ }
+ return true;
+}
+
/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
@@ -564,19 +584,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
return LoopUnrollResult::Unmodified;
}
- // Loops containing convergent instructions cannot use runtime unrolling,
- // as the prologue/epilogue may add additional control-dependencies to
- // convergent operations.
- LLVM_DEBUG(
- {
- bool HasConvergent = false;
- for (auto &BB : L->blocks())
- for (auto &I : *BB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- HasConvergent |= CB->isConvergent();
- assert((!HasConvergent || !ULO.Runtime) &&
- "Can't runtime unroll if loop contains a convergent operation.");
- });
+ assert((!ULO.Runtime || canHaveUnrollRemainder(L)) &&
+ "Can't runtime unroll if loop contains a convergent operation.");
bool EpilogProfitability =
UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
@@ -722,7 +731,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
if (OldLoop)
LoopsToSimplify.insert(NewLoops[OldLoop]);
- if (*BB == Header)
+ if (*BB == Header) {
// Loop over all of the PHI nodes in the block, changing them to use
// the incoming values from the previous block.
for (PHINode *OrigPHI : OrigPHINode) {
@@ -735,6 +744,16 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
NewPHI->eraseFromParent();
}
+ // Eliminate copies of the loop heart intrinsic, if any.
+ if (ULO.Heart) {
+ auto it = VMap.find(ULO.Heart);
+ assert(it != VMap.end());
+ Instruction *heartCopy = cast<Instruction>(it->second);
+ heartCopy->eraseFromParent();
+ VMap.erase(it);
+ }
+ }
+
// Update our running map of newest clones
LastValueMap[*BB] = New;
for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index e1af02829c1da..dd7150bc63ec4 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -1016,12 +1016,17 @@ bool llvm::UnrollRuntimeLoopRemainder(
auto UnrollResult = LoopUnrollResult::Unmodified;
if (remainderLoop && UnrollRemainder) {
LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
- UnrollResult =
- UnrollLoop(remainderLoop,
- {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false,
- /*AllowExpensiveTripCount*/ false,
- /*UnrollRemainder*/ false, ForgetAllSCEV},
- LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+ UnrollLoopOptions ULO;
+ ULO.Count = Count - 1;
+ ULO.Force = false;
+ ULO.Runtime = false;
+ ULO.AllowExpensiveTripCount = false;
+ ULO.UnrollRemainder = false;
+ ULO.ForgetAllSCEV = ForgetAllSCEV;
+ assert(!getLoopConvergenceHeart(L) &&
+ "A loop with a convergence heart does not allow runtime unrolling.");
+ UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI,
+ /*ORE*/ nullptr, PreserveLCSSA);
}
if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
diff --git a/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
new file mode 100644
index 0000000000000..7fd4eb18f16eb
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
@@ -0,0 +1,562 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s
+
+declare void @f() convergent
+declare void @g()
+
+; Although this loop contains a convergent instruction, it should be
+; fully unrolled.
+define i32 @full_unroll() {
+; CHECK-LABEL: @full_unroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: br label [[A:%.*]]
+; CHECK: a:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_1:%.*]]
+; CHECK: a.1:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_2:%.*]]
+; CHECK: a.2:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 3
+ br label %a
+
+a:
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction, but it should be partially
+; unrolled. The unroll count is the largest power of 2 that divides the
+; multiple -- 4, in this case.
+define i32 @runtime_unroll(i32 %n) {
+; CHECK-LABEL: @runtime_unroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 12
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_3:%.*]], [[A_3:%.*]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: br label [[A:%.*]]
+; CHECK: a:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_1:%.*]]
+; CHECK: a.1:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_2:%.*]]
+; CHECK: a.2:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_3]]
+; CHECK: a.3:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: [[INC_3]] = add nsw i32 [[X_0]], 4
+; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[LOOP_CTL]]
+; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[EXIT:%.*]], label [[L3]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ %loop_ctl = mul nsw i32 %n, 12
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ br label %a
+
+a:
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %loop_ctl
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction, so its partial unroll
+; count must divide its trip multiple. This overrides its unroll
+; pragma -- we unroll exactly 8 times, even though 16 is requested.
+define i32 @pragma_unroll(i32 %n) {
+; CHECK-LABEL: @pragma_unroll(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 24
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[A_7:%.*]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: br label [[A:%.*]]
+; CHECK: a:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_1:%.*]]
+; CHECK: a.1:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_2:%.*]]
+; CHECK: a.2:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_3:%.*]]
+; CHECK: a.3:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_4:%.*]]
+; CHECK: a.4:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_5:%.*]]
+; CHECK: a.5:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_6:%.*]]
+; CHECK: a.6:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: br label [[A_7]]
+; CHECK: a.7:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: [[INC_7]] = add nsw i32 [[X_0]], 8
+; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp eq i32 [[INC_7]], [[LOOP_CTL]]
+; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ %loop_ctl = mul nsw i32 %n, 24
+ br label %l3, !llvm.loop !0
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ br label %a
+
+a:
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %loop_ctl
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 divides trip count 4, the loop unroll should respect the pragma.
+define void @pragma_unroll_divisible_trip_count() {
+; CHECK-LABEL: @pragma_unroll_divisible_trip_count(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[X_0]], 2
+; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 4
+; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ br label %l3, !llvm.loop !1
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 4
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+ ret void
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 divides trip multiple 2, the loop unroll should respect the pragma.
+define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 2
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: [[INC_1]] = add nsw i32 [[X_0]], 2
+; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], [[LOOP_CTL]]
+; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ %loop_ctl = mul nsw i32 %n, 2
+ br label %l3, !llvm.loop !1
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %loop_ctl
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction. The pragma loop unroll count 2
+; is not known to divide the runtime trip count, so the loop is not unrolled:
+; a remainder is forbidden when unrolling a loop with a convergence heart.
+define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ br label %l3, !llvm.loop !1
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+ ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 does not divide the trip count 5, the loop is not unrolled by 2,
+; because a remainder is forbidden when unrolling a loop with a convergence
+; heart. Instead, the loop gets fully unrolled.
+define i32 @pragma_unroll_indivisible_trip_count() {
+; CHECK-LABEL: @pragma_unroll_indivisible_trip_count(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l3:
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ %anchor = call token @llvm.experimental.convergence.anchor()
+ br label %l3, !llvm.loop !1
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+ %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 5
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+ ret i32 0
+}
+
+; The convergent call uses a token anchored inside the loop itself, so the loop
+; is unrolled by 2 with a remainder, as requested by the loop metadata.
+define i32 @pragma_unroll_with_remainder(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_with_remainder(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[N:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 1
+; CHECK-NEXT: br i1 [[TMP2]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK: entry.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[TMP0]], [[XTRAITER]]
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[L3]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: [[TOK_LOOP_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_1]]) ]
+; CHECK-NEXT: [[INC_1]] = add nsw i32 [[X_0]], 2
+; CHECK-NEXT: [[NITER_NEXT_1]] = add i32 [[NITER]], 2
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i32 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[L3]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: exit.unr-lcssa.loopexit:
+; CHECK-NEXT: br label [[EXIT_UNR_LCSSA]]
+; CHECK: exit.unr-lcssa:
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[L3_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK: l3.epil.preheader:
+; CHECK-NEXT: br label [[L3_EPIL:%.*]]
+; CHECK: l3.epil:
+; CHECK-NEXT: [[TOK_LOOP_EPIL:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_EPIL]]) ]
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %l3, !llvm.loop !1
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+ %tok.loop = call token @llvm.experimental.convergence.anchor()
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+ ret i32 0
+}
+
+; Don't unroll a loop that is extended by convergence controls: the token
+; defined inside the loop (%tok.loop) is used outside it, in %exit.
+; We could theoretically duplicate the extension part, but this is not
+; implemented.
+define i32 @extended_loop(i32 %n) {
+; CHECK-LABEL: @extended_loop(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]]
+; CHECK: exit:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %l3, !llvm.loop !1
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+ %tok.loop = call token @llvm.experimental.convergence.anchor()
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, %n
+ br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+ call void @f() [ "convergencectrl"(token %tok.loop) ]
+ ret i32 0
+}
+
+; Inner loop is extended beyond the outer loop: %tok.l2 is used in %exit,
+; outside both loops. No unrolling is possible.
+define i32 @extended_inner_loop_1(i32 %n, i1 %cond) {
+; CHECK-LABEL: @extended_inner_loop_1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 4
+; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2:
+; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP4]]
+; CHECK: latch:
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]]
+; CHECK: exit:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+ %tok.loop = call token @llvm.experimental.convergence.anchor()
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 4
+ br label %l2, !llvm.loop !1
+
+l2:
+ %tok.l2 = call token @llvm.experimental.convergence.anchor()
+ call void @f() [ "convergencectrl"(token %tok.l2) ]
+ br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ call void @f() [ "convergencectrl"(token %tok.l2) ]
+ ret i32 0
+}
+
+; Inner loop is extended inside the outer loop: %tok.l2 is used in %latch.
+; Only the outer loop is unrolled.
+define i32 @extended_inner_loop_2(i32 %n, i1 %cond) {
+; CHECK-LABEL: @extended_inner_loop_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2:
+; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br label [[L2_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1:
+; CHECK-NEXT: [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch.1:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT: br label [[L2_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.2:
+; CHECK-NEXT: [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_2]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch.2:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT: br label [[L2_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.3:
+; CHECK-NEXT: [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_3]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: latch.3:
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+ %tok.loop = call token @llvm.experimental.convergence.anchor()
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 4
+ br label %l2, !llvm.loop !1
+
+l2:
+ %tok.l2 = call token @llvm.experimental.convergence.anchor()
+ call void @f() [ "convergencectrl"(token %tok.l2) ]
+ br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+ call void @f() [ "convergencectrl"(token %tok.l2) ]
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ ret i32 0
+}
+
+; No extension: %tok.l2 is used only inside the inner loop. Both loops are
+; unrolled.
+define i32 @unroll_nest(i32 %n, i1 %cond) {
+; CHECK-LABEL: @unroll_nest(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[L3:%.*]]
+; CHECK: l3:
+; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2:
+; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2_1:%.*]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1:
+; CHECK-NEXT: [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK: latch:
+; CHECK-NEXT: br label [[L2_12:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.12:
+; CHECK-NEXT: [[TOK_L2_11:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_11]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1_1:%.*]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1.1:
+; CHECK-NEXT: [[TOK_L2_1_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_1]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_12]], label [[LATCH_1]], !llvm.loop [[LOOP9]]
+; CHECK: latch.1:
+; CHECK-NEXT: br label [[L2_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.2:
+; CHECK-NEXT: [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1_2:%.*]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1.2:
+; CHECK-NEXT: [[TOK_L2_1_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_2]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_2]], label [[LATCH_2]], !llvm.loop [[LOOP9]]
+; CHECK: latch.2:
+; CHECK-NEXT: br label [[L2_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.3:
+; CHECK-NEXT: [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_1_3:%.*]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK: l2.1.3:
+; CHECK-NEXT: [[TOK_L2_1_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_3]]) ]
+; CHECK-NEXT: br i1 [[COND]], label [[L2_3]], label [[LATCH_3]], !llvm.loop [[LOOP9]]
+; CHECK: latch.3:
+; CHECK-NEXT: ret i32 0
+;
+entry:
+ br label %l3
+
+l3:
+ %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+ %tok.loop = call token @llvm.experimental.convergence.anchor()
+ %inc = add nsw i32 %x.0, 1
+ %exitcond = icmp eq i32 %inc, 4
+ br label %l2, !llvm.loop !1
+
+l2:
+ %tok.l2 = call token @llvm.experimental.convergence.anchor()
+ call void @f() [ "convergencectrl"(token %tok.l2) ]
+ br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+ br i1 %exitcond, label %exit, label %l3
+
+exit:
+ ret i32 0
+}
+
+declare token @llvm.experimental.convergence.anchor()
+declare token @llvm.experimental.convergence.loop()
+
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
+!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}
More information about the llvm-commits
mailing list