[llvm] e0ac087 - [LoopUnroll] Consider convergence control tokens when unrolling (#91715)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 6 00:43:50 PDT 2024


Author: Sameer Sahasrabuddhe
Date: 2024-06-06T13:13:46+05:30
New Revision: e0ac087ff004f7a63ba64b9685f4f098d6ee54c5

URL: https://github.com/llvm/llvm-project/commit/e0ac087ff004f7a63ba64b9685f4f098d6ee54c5
DIFF: https://github.com/llvm/llvm-project/commit/e0ac087ff004f7a63ba64b9685f4f098d6ee54c5.diff

LOG: [LoopUnroll] Consider convergence control tokens when unrolling (#91715)

- There is no restriction on a loop with controlled convergent
operations when
  the relevant tokens are defined and used within the loop.

- When a token defined outside a loop is used inside (also called a loop
convergence heart), unrolling is allowed only in the absence of
remainder or
  runtime checks.

- When a token defined inside a loop is used outside, such a loop is
said to be
"extended". This loop can only be unrolled by also duplicating the
extended part
  lying outside the loop. Such unrolling is disabled for now.

- Clean up loop hearts: When unrolling a loop with a heart, duplicating
the
heart will introduce multiple static uses of a convergence control token
in a
cycle that does not contain its definition. This violates the static
rules for
tokens, and needs to be cleaned up into a single occurrence of the
intrinsic.

- Spell out the initializer for UnrollLoopOptions to improve
readability.


Original implementation [D85605] by Nicolai Haehnle
<nicolai.haehnle at amd.com>.

Added: 
    llvm/test/Transforms/LoopUnroll/convergent.controlled.ll

Modified: 
    llvm/include/llvm/Analysis/CodeMetrics.h
    llvm/include/llvm/Analysis/LoopInfo.h
    llvm/include/llvm/IR/InstrTypes.h
    llvm/include/llvm/IR/IntrinsicInst.h
    llvm/include/llvm/Transforms/Utils/UnrollLoop.h
    llvm/lib/Analysis/CodeMetrics.cpp
    llvm/lib/Analysis/LoopInfo.cpp
    llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
    llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
    llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
    llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
    llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
    llvm/lib/Transforms/Utils/LoopUnroll.cpp
    llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h
index a9431bca11251..d09018daf9548 100644
--- a/llvm/include/llvm/Analysis/CodeMetrics.h
+++ b/llvm/include/llvm/Analysis/CodeMetrics.h
@@ -20,12 +20,15 @@
 namespace llvm {
 class AssumptionCache;
 class BasicBlock;
+class Instruction;
 class Loop;
 class Function;
 template <class T> class SmallPtrSetImpl;
 class TargetTransformInfo;
 class Value;
 
+enum struct ConvergenceKind { None, Controlled, ExtendedLoop, Uncontrolled };
+
 /// Utility to calculate the size and a few similar metrics for a set
 /// of basic blocks.
 struct CodeMetrics {
@@ -42,8 +45,8 @@ struct CodeMetrics {
   /// one or more 'noduplicate' instructions.
   bool notDuplicatable = false;
 
-  /// True if this function contains a call to a convergent function.
-  bool convergent = false;
+  /// The kind of convergence specified in this function.
+  ConvergenceKind Convergence = ConvergenceKind::None;
 
   /// True if this function calls alloca (in the C sense).
   bool usesDynamicAlloca = false;
@@ -77,7 +80,7 @@ struct CodeMetrics {
   /// Add information about a block to the current state.
   void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI,
                          const SmallPtrSetImpl<const Value *> &EphValues,
-                         bool PrepareForLTO = false);
+                         bool PrepareForLTO = false, const Loop *L = nullptr);
 
   /// Collect a loop's ephemeral values (those used only by an assume
   /// or similar intrinsics in the loop).

diff  --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index 52084630560c5..4f06a7e889f91 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -649,6 +649,9 @@ int getIntLoopAttribute(const Loop *TheLoop, StringRef Name, int Default = 0);
 std::optional<const MDOperand *> findStringMetadataForLoop(const Loop *TheLoop,
                                                            StringRef Name);
 
+/// Find the convergence heart of the loop.
+CallBase *getLoopConvergenceHeart(const Loop *TheLoop);
+
 /// Look for the loop attribute that requires progress within the loop.
 /// Note: Most consumers probably want "isMustProgress" which checks
 /// the containing function attribute too.

diff  --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 9dd1bb455a718..441e6a1e79843 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1588,6 +1588,14 @@ class CallBase : public Instruction {
   static CallBase *removeOperandBundle(CallBase *CB, uint32_t ID,
                                        BasicBlock::iterator InsertPt);
 
+  /// Return the convergence control token for this call, if it exists.
+  Value *getConvergenceControlToken() const {
+    if (auto Bundle = getOperandBundle(llvm::LLVMContext::OB_convergencectrl)) {
+      return Bundle->Inputs[0].get();
+    }
+    return nullptr;
+  }
+
   static bool classof(const Instruction *I) {
     return I->getOpcode() == Instruction::Call ||
            I->getOpcode() == Instruction::Invoke ||

diff  --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index fcd3a1025ac13..9010e1a1c896b 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1799,17 +1799,14 @@ class ConvergenceControlInst : public IntrinsicInst {
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
   }
 
-  // Returns the convergence intrinsic referenced by |I|'s convergencectrl
-  // attribute if any.
-  static IntrinsicInst *getParentConvergenceToken(Instruction *I) {
-    auto *CI = dyn_cast<llvm::CallInst>(I);
-    if (!CI)
-      return nullptr;
-
-    auto Bundle = CI->getOperandBundle(llvm::LLVMContext::OB_convergencectrl);
-    assert(Bundle->Inputs.size() == 1 &&
-           Bundle->Inputs[0]->getType()->isTokenTy());
-    return dyn_cast<llvm::IntrinsicInst>(Bundle->Inputs[0].get());
+  bool isAnchor() {
+    return getIntrinsicID() == Intrinsic::experimental_convergence_anchor;
+  }
+  bool isEntry() {
+    return getIntrinsicID() == Intrinsic::experimental_convergence_entry;
+  }
+  bool isLoop() {
+    return getIntrinsicID() == Intrinsic::experimental_convergence_loop;
   }
 };
 

diff  --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
index bd804dc112662..797c082333a76 100644
--- a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -16,6 +16,7 @@
 #define LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Support/InstructionCost.h"
 
@@ -73,6 +74,7 @@ struct UnrollLoopOptions {
   bool AllowExpensiveTripCount;
   bool UnrollRemainder;
   bool ForgetAllSCEV;
+  const Instruction *Heart = nullptr;
 };
 
 LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
@@ -128,14 +130,15 @@ class UnrollCostEstimator {
 
 public:
   unsigned NumInlineCandidates;
-  bool Convergent;
+  ConvergenceKind Convergence;
+  bool ConvergenceAllowsRuntime;
 
   UnrollCostEstimator(const Loop *L, const TargetTransformInfo &TTI,
                       const SmallPtrSetImpl<const Value *> &EphValues,
                       unsigned BEInsns);
 
   /// Whether it is legal to unroll this loop.
-  bool canUnroll() const { return LoopSize.isValid() && !NotDuplicatable; }
+  bool canUnroll() const;
 
   uint64_t getRolledLoopSize() const { return *LoopSize.getValue(); }
 

diff  --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 2637e2f97dbb2..ea67b526423bf 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/InstructionCost.h"
 
@@ -111,11 +112,24 @@ void CodeMetrics::collectEphemeralValues(
   completeEphemeralValues(Visited, Worklist, EphValues);
 }
 
+static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) {
+  if (!L)
+    return false;
+  if (!isa<ConvergenceControlInst>(I))
+    return false;
+  for (const auto *U : I.users()) {
+    if (!L->contains(cast<Instruction>(U)))
+      return true;
+  }
+  return false;
+}
+
 /// Fill in the current structure with information gleaned from the specified
 /// block.
 void CodeMetrics::analyzeBasicBlock(
     const BasicBlock *BB, const TargetTransformInfo &TTI,
-    const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) {
+    const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO,
+    const Loop *L) {
   ++NumBlocks;
   InstructionCost NumInstsBeforeThisBB = NumInsts;
   for (const Instruction &I : *BB) {
@@ -163,19 +177,38 @@ void CodeMetrics::analyzeBasicBlock(
     if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy())
       ++NumVectorInsts;
 
-    if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+    if (I.getType()->isTokenTy() && !isa<ConvergenceControlInst>(I) &&
+        I.isUsedOutsideOfBlock(BB)) {
+      LLVM_DEBUG(dbgs() << I
+                        << "\n  Cannot duplicate a token value used outside "
+                           "the current block (except convergence control).\n");
       notDuplicatable = true;
-
-    if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
-      if (CI->cannotDuplicate())
-        notDuplicatable = true;
-      if (CI->isConvergent())
-        convergent = true;
     }
 
-    if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I))
-      if (InvI->cannotDuplicate())
+    if (const CallBase *CB = dyn_cast<CallBase>(&I)) {
+      if (CB->cannotDuplicate())
         notDuplicatable = true;
+      // Compute a meet over the visited blocks for the following partial order:
+      //
+      // None -> { Controlled, ExtendedLoop, Uncontrolled}
+      // Controlled -> ExtendedLoop
+      if (Convergence <= ConvergenceKind::Controlled && CB->isConvergent()) {
+        if (isa<ConvergenceControlInst>(CB) ||
+            CB->getConvergenceControlToken()) {
+          assert(Convergence != ConvergenceKind::Uncontrolled);
+          LLVM_DEBUG(dbgs() << "Found controlled convergence:\n" << I << "\n");
+          if (extendsConvergenceOutsideLoop(I, L))
+            Convergence = ConvergenceKind::ExtendedLoop;
+          else {
+            assert(Convergence != ConvergenceKind::ExtendedLoop);
+            Convergence = ConvergenceKind::Controlled;
+          }
+        } else {
+          assert(Convergence == ConvergenceKind::None);
+          Convergence = ConvergenceKind::Uncontrolled;
+        }
+      }
+    }
 
     NumInsts += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
   }

diff  --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 369ab087ffc0f..c34c4974382ea 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -1105,6 +1105,26 @@ int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name,
   return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default);
 }
 
+CallBase *llvm::getLoopConvergenceHeart(const Loop *TheLoop) {
+  BasicBlock *H = TheLoop->getHeader();
+  for (Instruction &II : *H) {
+    if (auto *CB = dyn_cast<CallBase>(&II)) {
+      if (!CB->isConvergent())
+        continue;
+      // This is the heart if it uses a token defined outside the loop. The
+      // verifier has already checked that only the loop intrinsic can use such
+      // a token.
+      if (auto *Token = CB->getConvergenceControlToken()) {
+        auto *TokenDef = cast<Instruction>(Token);
+        if (!TheLoop->contains(TokenDef->getParent()))
+          return CB;
+      }
+      return nullptr;
+    }
+  }
+  return nullptr;
+}
+
 bool llvm::isFinite(const Loop *L) {
   return L->getHeader()->getParent()->willReturn();
 }

diff  --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 2c4b45255d059..92213e19c9d9d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3961,7 +3961,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
   UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
 
   // Loop is not unrollable if the loop contains certain instructions.
-  if (!UCE.canUnroll() || UCE.Convergent) {
+  if (!UCE.canUnroll()) {
     LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
     return 1;
   }

diff  --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index ba2546b8db0e2..4371b821eae63 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -827,7 +827,8 @@ struct TransformDFA {
         return false;
       }
 
-      if (Metrics.convergent) {
+      // FIXME: Allow jump threading with controlled convergence.
+      if (Metrics.Convergence != ConvergenceKind::None) {
         LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains "
                           << "convergent instructions.\n");
         ORE->emit([&]() {

diff  --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 7b4c54370e48a..f8e2f1f28088d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
   UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns);
 
   if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) {
-    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains instructions"
-                      << " which cannot be duplicated or have invalid cost.\n");
+    LLVM_DEBUG(dbgs() << "  Loop not considered unrollable\n");
     return LoopUnrollResult::Unmodified;
   }
 
@@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
     LLVM_DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
     return LoopUnrollResult::Unmodified;
   }
-  if (InnerUCE.Convergent || OuterUCE.Convergent) {
+  // FIXME: The call to canUnroll() allows some controlled convergent
+  // operations, but we block them here for future changes.
+  if (InnerUCE.Convergence != ConvergenceKind::None ||
+      OuterUCE.Convergence != ConvergenceKind::None) {
     LLVM_DEBUG(
         dbgs() << "  Not unrolling loop with convergent instructions.\n");
     return LoopUnrollResult::Unmodified;

diff  --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 10fc9e9303e89..cbc35b6dd4292 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -684,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator(
     const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
   CodeMetrics Metrics;
   for (BasicBlock *BB : L->blocks())
-    Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+    Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
+                              L);
   NumInlineCandidates = Metrics.NumInlineCandidates;
   NotDuplicatable = Metrics.notDuplicatable;
-  Convergent = Metrics.convergent;
+  Convergence = Metrics.Convergence;
   LoopSize = Metrics.NumInsts;
+  ConvergenceAllowsRuntime =
+      Metrics.Convergence != ConvergenceKind::Uncontrolled &&
+      !getLoopConvergenceHeart(L);
 
   // Don't allow an estimate of size zero.  This would allows unrolling of loops
   // with huge iteration counts, which is a compile time problem even if it's
@@ -701,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator(
     LoopSize = BEInsns + 1;
 }
 
+bool UnrollCostEstimator::canUnroll() const {
+  switch (Convergence) {
+  case ConvergenceKind::ExtendedLoop:
+    LLVM_DEBUG(dbgs() << "  Convergence prevents unrolling.\n");
+    return false;
+  default:
+    break;
+  }
+  if (!LoopSize.isValid()) {
+    LLVM_DEBUG(dbgs() << "  Invalid loop size prevents unrolling.\n");
+    return false;
+  }
+  if (NotDuplicatable) {
+    LLVM_DEBUG(dbgs() << "  Non-duplicatable blocks prevent unrolling.\n");
+    return false;
+  }
+  return true;
+}
+
 uint64_t UnrollCostEstimator::getUnrolledLoopSize(
     const TargetTransformInfo::UnrollingPreferences &UP,
     unsigned CountOverwrite) const {
@@ -1206,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
   if (!UCE.canUnroll()) {
-    LLVM_DEBUG(dbgs() << "  Not unrolling loop which contains instructions"
-                      << " which cannot be duplicated or have invalid cost.\n");
+    LLVM_DEBUG(dbgs() << "  Loop not considered unrollable.\n");
     return LoopUnrollResult::Unmodified;
   }
 
@@ -1254,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   // is unsafe -- it adds a control-flow dependency to the convergent
   // operation.  Therefore restrict remainder loop (try unrolling without).
   //
-  // TODO: This is quite conservative.  In practice, convergent_op()
-  // is likely to be called unconditionally in the loop.  In this
-  // case, the program would be ill-formed (on most architectures)
-  // unless n were the same on all threads in a thread group.
-  // Assuming n is the same on all threads, any kind of unrolling is
-  // safe.  But currently llvm's notion of convergence isn't powerful
-  // enough to express this.
-  if (UCE.Convergent)
-    UP.AllowRemainder = false;
+  // TODO: This is somewhat conservative; we could allow the remainder if the
+  // trip count is uniform.
+  UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
 
   // Try to find the trip count upper bound if we cannot find the exact trip
   // count.
@@ -1282,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
   if (!UP.Count)
     return LoopUnrollResult::Unmodified;
 
+  UP.Runtime &= UCE.ConvergenceAllowsRuntime;
+
   if (PP.PeelCount) {
     assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
     LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
@@ -1324,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
 
   // Unroll the loop.
   Loop *RemainderLoop = nullptr;
+  UnrollLoopOptions ULO;
+  ULO.Count = UP.Count;
+  ULO.Force = UP.Force;
+  ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
+  ULO.UnrollRemainder = UP.UnrollRemainder;
+  ULO.Runtime = UP.Runtime;
+  ULO.ForgetAllSCEV = ForgetAllSCEV;
+  ULO.Heart = getLoopConvergenceHeart(L);
   LoopUnrollResult UnrollResult = UnrollLoop(
-      L,
-      {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
-       UP.UnrollRemainder, ForgetAllSCEV},
-      LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
+      L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
   if (UnrollResult == LoopUnrollResult::Unmodified)
     return LoopUnrollResult::Unmodified;
 

diff  --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 08ba65d9483e0..3d950b151cd32 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -460,7 +460,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
                    L->dump());
         return Rotated;
       }
-      if (Metrics.convergent) {
+      if (Metrics.Convergence != ConvergenceKind::None) {
         LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
                    "instructions: ";
                    L->dump());

diff  --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 1216538195fbd..90d7b99e9d817 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -419,6 +419,26 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
   }
 }
 
+// Loops containing convergent instructions that are uncontrolled or controlled
+// from outside the loop must have a count that divides their TripMultiple.
+LLVM_ATTRIBUTE_USED
+static bool canHaveUnrollRemainder(const Loop *L) {
+  if (getLoopConvergenceHeart(L))
+    return false;
+
+  // Check for uncontrolled convergent operations.
+  for (auto &BB : L->blocks()) {
+    for (auto &I : *BB) {
+      if (isa<ConvergenceControlInst>(I))
+        return true;
+      if (auto *CB = dyn_cast<CallBase>(&I))
+        if (CB->isConvergent())
+          return CB->getConvergenceControlToken();
+    }
+  }
+  return true;
+}
+
 /// Unroll the given loop by Count. The loop must be in LCSSA form.  Unrolling
 /// can only fail when the loop's latch block is not terminated by a conditional
 /// branch instruction. However, if the trip count (and multiple) are not known,
@@ -564,19 +584,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
     return LoopUnrollResult::Unmodified;
   }
 
-  // Loops containing convergent instructions cannot use runtime unrolling,
-  // as the prologue/epilogue may add additional control-dependencies to
-  // convergent operations.
-  LLVM_DEBUG(
-      {
-        bool HasConvergent = false;
-        for (auto &BB : L->blocks())
-          for (auto &I : *BB)
-            if (auto *CB = dyn_cast<CallBase>(&I))
-              HasConvergent |= CB->isConvergent();
-        assert((!HasConvergent || !ULO.Runtime) &&
-               "Can't runtime unroll if loop contains a convergent operation.");
-      });
+  assert((!ULO.Runtime || canHaveUnrollRemainder(L)) &&
+         "Can't runtime unroll if loop contains a convergent operation.");
 
   bool EpilogProfitability =
       UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
@@ -722,7 +731,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
       if (OldLoop)
         LoopsToSimplify.insert(NewLoops[OldLoop]);
 
-      if (*BB == Header)
+      if (*BB == Header) {
         // Loop over all of the PHI nodes in the block, changing them to use
         // the incoming values from the previous block.
         for (PHINode *OrigPHI : OrigPHINode) {
@@ -735,6 +744,16 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
           NewPHI->eraseFromParent();
         }
 
+        // Eliminate copies of the loop heart intrinsic, if any.
+        if (ULO.Heart) {
+          auto it = VMap.find(ULO.Heart);
+          assert(it != VMap.end());
+          Instruction *heartCopy = cast<Instruction>(it->second);
+          heartCopy->eraseFromParent();
+          VMap.erase(it);
+        }
+      }
+
       // Update our running map of newest clones
       LastValueMap[*BB] = New;
       for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();

diff  --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index e1af02829c1da..dd7150bc63ec4 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -1016,12 +1016,17 @@ bool llvm::UnrollRuntimeLoopRemainder(
   auto UnrollResult = LoopUnrollResult::Unmodified;
   if (remainderLoop && UnrollRemainder) {
     LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
-    UnrollResult =
-        UnrollLoop(remainderLoop,
-                   {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false,
-                    /*AllowExpensiveTripCount*/ false,
-                    /*UnrollRemainder*/ false, ForgetAllSCEV},
-                   LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+    UnrollLoopOptions ULO;
+    ULO.Count = Count - 1;
+    ULO.Force = false;
+    ULO.Runtime = false;
+    ULO.AllowExpensiveTripCount = false;
+    ULO.UnrollRemainder = false;
+    ULO.ForgetAllSCEV = ForgetAllSCEV;
+    assert(!getLoopConvergenceHeart(L) &&
+           "A loop with a convergence heart does not allow runtime unrolling.");
+    UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI,
+                              /*ORE*/ nullptr, PreserveLCSSA);
   }
 
   if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)

diff  --git a/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
new file mode 100644
index 0000000000000..7fd4eb18f16eb
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll
@@ -0,0 +1,562 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-unroll -unroll-runtime -unroll-allow-partial -S | FileCheck %s
+
+declare void @f() convergent
+declare void @g()
+
+; Although this loop contains a convergent instruction, it should be
+; fully unrolled.
+define i32 @full_unroll() {
+; CHECK-LABEL: @full_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    br label [[A:%.*]]
+; CHECK:       a:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_1:%.*]]
+; CHECK:       a.1:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_2:%.*]]
+; CHECK:       a.2:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 3
+  br label %a
+
+a:
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction, but it should be partially
+; unrolled.  The unroll count is the largest power of 2 that divides the
+; multiple -- 4, in this case.
+define i32 @runtime_unroll(i32 %n) {
+; CHECK-LABEL: @runtime_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 12
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_3:%.*]], [[A_3:%.*]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    br label [[A:%.*]]
+; CHECK:       a:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_1:%.*]]
+; CHECK:       a.1:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_2:%.*]]
+; CHECK:       a.2:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_3]]
+; CHECK:       a.3:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_3]] = add nsw i32 [[X_0]], 4
+; CHECK-NEXT:    [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[LOOP_CTL]]
+; CHECK-NEXT:    br i1 [[EXITCOND_3]], label [[EXIT:%.*]], label [[L3]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  %loop_ctl = mul nsw i32 %n, 12
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  br label %a
+
+a:
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction, so its partial unroll
+; count must divide its trip multiple.  This overrides its unroll
+; pragma -- we unroll exactly 8 times, even though 16 is requested.
+define i32 @pragma_unroll(i32 %n) {
+; CHECK-LABEL: @pragma_unroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 24
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[A_7:%.*]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    br label [[A:%.*]]
+; CHECK:       a:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_1:%.*]]
+; CHECK:       a.1:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_2:%.*]]
+; CHECK:       a.2:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_3:%.*]]
+; CHECK:       a.3:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_4:%.*]]
+; CHECK:       a.4:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_5:%.*]]
+; CHECK:       a.5:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_6:%.*]]
+; CHECK:       a.6:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    br label [[A_7]]
+; CHECK:       a.7:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_7]] = add nsw i32 [[X_0]], 8
+; CHECK-NEXT:    [[EXITCOND_7:%.*]] = icmp eq i32 [[INC_7]], [[LOOP_CTL]]
+; CHECK-NEXT:    br i1 [[EXITCOND_7]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  %loop_ctl = mul nsw i32 %n, 24
+  br label %l3, !llvm.loop !0
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  br label %a
+
+a:
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !0
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 divides trip count 4. The loop unroll should respect the pragma.
+define void @pragma_unroll_divisible_trip_count() {
+; CHECK-LABEL: @pragma_unroll_divisible_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_1]] = add nuw nsw i32 [[X_0]], 2
+; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 4
+; CHECK-NEXT:    br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret void
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 divides trip multiple 2. The loop unroll should respect the pragma.
+define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 2
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC_1]] = add nsw i32 [[X_0]], 2
+; CHECK-NEXT:    [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], [[LOOP_CTL]]
+; CHECK-NEXT:    br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  %loop_ctl = mul nsw i32 %n, 2
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 is unknown to divide runtime trip count, the loop is not unrolled
+; since remainder is forbidden for unrolling convergent loop.
+define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction. Since the pragma loop unroll
+; count 2 does not divide trip count 5, the loop is not unrolled by 2
+; since remainder is forbidden for unrolling convergent loop. Instead, the
+; loop gets fully unrolled.
+define i32 @pragma_unroll_indivisible_trip_count() {
+; CHECK-LABEL: @pragma_unroll_indivisible_trip_count(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %anchor = call token @llvm.experimental.convergence.anchor()
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ]
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 5
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; This loop contains a convergent instruction that is anchored inside the loop
+; itself. It is unrolled by 2 with remainder, as requested by the loop metadata.
+define i32 @pragma_unroll_with_remainder(i32 %n) {
+; CHECK-LABEL: @pragma_unroll_with_remainder(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = freeze i32 [[N:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[TMP0]], -1
+; CHECK-NEXT:    [[XTRAITER:%.*]] = and i32 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 1
+; CHECK-NEXT:    br i1 [[TMP2]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]]
+; CHECK:       entry.new:
+; CHECK-NEXT:    [[UNROLL_ITER:%.*]] = sub i32 [[TMP0]], [[XTRAITER]]
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[INC_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[NITER:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    [[TOK_LOOP_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP_1]]) ]
+; CHECK-NEXT:    [[INC_1]] = add nsw i32 [[X_0]], 2
+; CHECK-NEXT:    [[NITER_NEXT_1]] = add i32 [[NITER]], 2
+; CHECK-NEXT:    [[NITER_NCMP_1:%.*]] = icmp eq i32 [[NITER_NEXT_1]], [[UNROLL_ITER]]
+; CHECK-NEXT:    br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[L3]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       exit.unr-lcssa.loopexit:
+; CHECK-NEXT:    br label [[EXIT_UNR_LCSSA]]
+; CHECK:       exit.unr-lcssa:
+; CHECK-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT:    br i1 [[LCMP_MOD]], label [[L3_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]]
+; CHECK:       l3.epil.preheader:
+; CHECK-NEXT:    br label [[L3_EPIL:%.*]]
+; CHECK:       l3.epil:
+; CHECK-NEXT:    [[TOK_LOOP_EPIL:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP_EPIL]]) ]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  ret i32 0
+}
+
+; Don't unroll a loop that is extended by convergence controls.
+;
+; We could theoretically duplicate the extension part, but this is not
+; implemented.
+define i32 @extended_loop(i32 %n) {
+; CHECK-LABEL: @extended_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[L3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3, !llvm.loop !1
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %l3, !llvm.loop !1
+
+exit:
+  call void @f() [ "convergencectrl"(token %tok.loop) ]
+  ret i32 0
+}
+
+; Inner loop is extended beyond the outer loop. No unrolling possible.
+
+define i32 @extended_inner_loop_1(i32 %n, i1 %cond) {
+; CHECK-LABEL: @extended_inner_loop_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT:    [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[X_0]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 4
+; CHECK-NEXT:    br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP4]]
+; CHECK:       latch:
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]]
+; CHECK:       exit:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br label %l2, !llvm.loop !1
+
+l2:
+  %tok.l2 = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  ret i32 0
+}
+
+; Inner loop is extended inside the outer loop. Outer loop is unrolled.
+
+define i32 @extended_inner_loop_2(i32 %n, i1 %cond) {
+; CHECK-LABEL: @extended_inner_loop_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[L2]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       latch:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT:    br label [[L2_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.1:
+; CHECK-NEXT:    [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_1]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       latch.1:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT:    br label [[L2_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.2:
+; CHECK-NEXT:    [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_2]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       latch.2:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT:    br label [[L2_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.3:
+; CHECK-NEXT:    [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_3]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       latch.3:
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br label %l2, !llvm.loop !1
+
+l2:
+  %tok.l2 = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; No extension. Both loops unrolled.
+
+define i32 @unroll_nest(i32 %n, i1 %cond) {
+; CHECK-LABEL: @unroll_nest(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[L3:%.*]]
+; CHECK:       l3:
+; CHECK-NEXT:    br label [[L2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2:
+; CHECK-NEXT:    [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2]]) ]
+; CHECK-NEXT:    br i1 [[COND:%.*]], label [[L2_1:%.*]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.1:
+; CHECK-NEXT:    [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       latch:
+; CHECK-NEXT:    br label [[L2_12:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.12:
+; CHECK-NEXT:    [[TOK_L2_11:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_11]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_1_1:%.*]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.1.1:
+; CHECK-NEXT:    [[TOK_L2_1_1:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_1_1]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_12]], label [[LATCH_1]], !llvm.loop [[LOOP9]]
+; CHECK:       latch.1:
+; CHECK-NEXT:    br label [[L2_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.2:
+; CHECK-NEXT:    [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_1_2:%.*]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.1.2:
+; CHECK-NEXT:    [[TOK_L2_1_2:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_1_2]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_2]], label [[LATCH_2]], !llvm.loop [[LOOP9]]
+; CHECK:       latch.2:
+; CHECK-NEXT:    br label [[L2_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.3:
+; CHECK-NEXT:    [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_1_3:%.*]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]]
+; CHECK:       l2.1.3:
+; CHECK-NEXT:    [[TOK_L2_1_3:%.*]] = call token @llvm.experimental.convergence.anchor()
+; CHECK-NEXT:    call void @f() [ "convergencectrl"(token [[TOK_L2_1_3]]) ]
+; CHECK-NEXT:    br i1 [[COND]], label [[L2_3]], label [[LATCH_3]], !llvm.loop [[LOOP9]]
+; CHECK:       latch.3:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %tok.loop = call token @llvm.experimental.convergence.anchor()
+  %inc = add nsw i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 4
+  br label %l2, !llvm.loop !1
+
+l2:
+  %tok.l2 = call token @llvm.experimental.convergence.anchor()
+  call void @f() [ "convergencectrl"(token %tok.l2) ]
+  br i1 %cond, label %l2, label %latch, !llvm.loop !1
+
+latch:
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+declare token @llvm.experimental.convergence.anchor()
+declare token @llvm.experimental.convergence.loop()
+
+!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}}
+!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}}


        


More information about the llvm-commits mailing list