[llvm] fa6a287 - [LoopInterchange] Enable interchange with multiple inner loop indvars

via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 14 13:31:10 PST 2022


Author: Congzhe Cao
Date: 2022-01-14T16:28:41-05:00
New Revision: fa6a2876c7e43fbb37c076dcf80ffe1ac22e49fc

URL: https://github.com/llvm/llvm-project/commit/fa6a2876c7e43fbb37c076dcf80ffe1ac22e49fc
DIFF: https://github.com/llvm/llvm-project/commit/fa6a2876c7e43fbb37c076dcf80ffe1ac22e49fc.diff

LOG: [LoopInterchange] Enable interchange with multiple inner loop indvars

Currently loop interchange only supports loops with one inner loop
induction variable. This patch adds support for transformation with
more than one inner loop induction variable. The induction PHIs and
induction increment instructions are moved/duplicated properly to the
new outer header and the new outer latch, respectively.

Reviewed By: bmahjour

Differential Revision: https://reviews.llvm.org/D114917

Added: 
    llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll

Modified: 
    llvm/lib/Transforms/Scalar/LoopInterchange.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index e10af46309603..c2b065c4eb314 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -292,33 +292,6 @@ static LoopVector populateWorklist(Loop &L) {
   return LoopList;
 }
 
-static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
-  PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
-  if (InnerIndexVar)
-    return InnerIndexVar;
-  if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
-    return nullptr;
-  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
-    PHINode *PhiVar = cast<PHINode>(I);
-    Type *PhiTy = PhiVar->getType();
-    if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
-        !PhiTy->isPointerTy())
-      return nullptr;
-    const SCEVAddRecExpr *AddRec =
-        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
-    if (!AddRec || !AddRec->isAffine())
-      continue;
-    const SCEV *Step = AddRec->getStepRecurrence(*SE);
-    if (!isa<SCEVConstant>(Step))
-      continue;
-    // Found the induction variable.
-    // FIXME: Handle loops with more than one induction variable. Note that,
-    // currently, legality makes sure we have only one induction variable.
-    return PhiVar;
-  }
-  return nullptr;
-}
-
 namespace {
 
 /// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -332,9 +305,13 @@ class LoopInterchangeLegality {
   bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
                            CharMatrix &DepMatrix);
 
+  /// Discover induction PHIs in the header of \p L. Induction
+  /// PHIs are added to \p Inductions.
+  bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
+
   /// Check if the loop structure is understood. We do not handle triangular
   /// loops for now.
-  bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+  bool isLoopStructureUnderstood();
 
   bool currentLimitations();
 
@@ -342,6 +319,10 @@ class LoopInterchangeLegality {
     return OuterInnerReductions;
   }
 
+  const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
+    return InnerLoopInductions;
+  }
+
 private:
   bool tightlyNested(Loop *Outer, Loop *Inner);
   bool containsUnsafeInstructions(BasicBlock *BB);
@@ -365,6 +346,9 @@ class LoopInterchangeLegality {
   /// Set of reduction PHIs taking part of a reduction across the inner and
   /// outer loop.
   SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+
+  /// Set of inner loop induction PHIs
+  SmallVector<PHINode *, 8> InnerLoopInductions;
 };
 
 /// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -635,25 +619,26 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
   return true;
 }
 
-bool LoopInterchangeLegality::isLoopStructureUnderstood(
-    PHINode *InnerInduction) {
-  unsigned Num = InnerInduction->getNumOperands();
+bool LoopInterchangeLegality::isLoopStructureUnderstood() {
   BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
-  for (unsigned i = 0; i < Num; ++i) {
-    Value *Val = InnerInduction->getOperand(i);
-    if (isa<Constant>(Val))
-      continue;
-    Instruction *I = dyn_cast<Instruction>(Val);
-    if (!I)
-      return false;
-    // TODO: Handle triangular loops.
-    // e.g. for(int i=0;i<N;i++)
-    //        for(int j=i;j<N;j++)
-    unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
-    if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
-            InnerLoopPreheader &&
-        !OuterLoop->isLoopInvariant(I)) {
-      return false;
+  for (PHINode *InnerInduction : InnerLoopInductions) {
+    unsigned Num = InnerInduction->getNumOperands();
+    for (unsigned i = 0; i < Num; ++i) {
+      Value *Val = InnerInduction->getOperand(i);
+      if (isa<Constant>(Val))
+        continue;
+      Instruction *I = dyn_cast<Instruction>(Val);
+      if (!I)
+        return false;
+      // TODO: Handle triangular loops.
+      // e.g. for(int i=0;i<N;i++)
+      //        for(int j=i;j<N;j++)
+      unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+      if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+              InnerLoopPreheader &&
+          !OuterLoop->isLoopInvariant(I)) {
+        return false;
+      }
     }
   }
 
@@ -682,27 +667,34 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
     // Return true if V is InnerInduction, or a cast from
     // InnerInduction, or a binary operator that involves
     // InnerInduction and a constant.
-    std::function<bool(Value *)> IsPathToIndVar;
-    IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool {
-      if (V == InnerInduction)
+    std::function<bool(Value *)> IsPathToInnerIndVar;
+    IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
+      if (llvm::is_contained(InnerLoopInductions, V))
         return true;
       if (isa<Constant>(V))
         return true;
-      Instruction *I = dyn_cast<Instruction>(V);
+      const Instruction *I = dyn_cast<Instruction>(V);
       if (!I)
         return false;
       if (isa<CastInst>(I))
-        return IsPathToIndVar(I->getOperand(0));
+        return IsPathToInnerIndVar(I->getOperand(0));
       if (isa<BinaryOperator>(I))
-        return IsPathToIndVar(I->getOperand(0)) &&
-               IsPathToIndVar(I->getOperand(1));
+        return IsPathToInnerIndVar(I->getOperand(0)) &&
+               IsPathToInnerIndVar(I->getOperand(1));
       return false;
     };
 
-    if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) {
+    // In case of multiple inner loop indvars, it is okay if LHS and RHS
+    // are both inner indvar related variables.
+    if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
+      return true;
+
+    // Otherwise we check if the cmp instruction compares an inner indvar
+    // related variable (Left) with a outer loop invariant (Right).
+    if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
       Left = Op0;
       Right = Op1;
-    } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) {
+    } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
       Left = Op1;
       Right = Op0;
     }
@@ -814,7 +806,6 @@ bool LoopInterchangeLegality::currentLimitations() {
     return true;
   }
 
-  PHINode *InnerInductionVar;
   SmallVector<PHINode *, 8> Inductions;
   if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
     LLVM_DEBUG(
@@ -845,24 +836,8 @@ bool LoopInterchangeLegality::currentLimitations() {
     return true;
   }
 
-  // TODO: Currently we handle only loops with 1 induction variable.
-  if (Inductions.size() != 1) {
-    LLVM_DEBUG(
-        dbgs() << "We currently only support loops with 1 induction variable."
-               << "Failed to interchange due to current limitation\n");
-    ORE->emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
-                                      InnerLoop->getStartLoc(),
-                                      InnerLoop->getHeader())
-             << "Only inner loops with 1 induction variable can be "
-                "interchanged currently.";
-    });
-    return true;
-  }
-  InnerInductionVar = Inductions.pop_back_val();
-
   // TODO: Triangular loops are not handled for now.
-  if (!isLoopStructureUnderstood(InnerInductionVar)) {
+  if (!isLoopStructureUnderstood()) {
     LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
     ORE->emit([&]() {
       return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
@@ -876,6 +851,16 @@ bool LoopInterchangeLegality::currentLimitations() {
   return false;
 }
 
+bool LoopInterchangeLegality::findInductions(
+    Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
+  for (PHINode &PHI : L->getHeader()->phis()) {
+    InductionDescriptor ID;
+    if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+      Inductions.push_back(&PHI);
+  }
+  return !Inductions.empty();
+}
+
 // We currently only support LCSSA PHI nodes in the inner loop exit, if their
 // users are either reduction PHIs or PHIs outside the outer loop (which means
 // the we are only interested in the final value after the loop).
@@ -1004,6 +989,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
         return false;
       }
 
+  if (!findInductions(InnerLoop, InnerLoopInductions)) {
+    LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n");
+    return false;
+  }
+
   if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
     LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
     ORE->emit([&]() {
@@ -1260,25 +1250,25 @@ void LoopInterchangeTransform::restructureLoops(
 
 bool LoopInterchangeTransform::transform() {
   bool Transformed = false;
-  Instruction *InnerIndexVar;
 
   if (InnerLoop->getSubLoops().empty()) {
     BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
     LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
-    PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
-    if (!InductionPHI) {
+    auto &InductionPHIs = LIL.getInnerLoopInductions();
+    if (InductionPHIs.empty()) {
       LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
       return false;
     }
 
-    if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
-      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
-    else
-      InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
-    // Ensure that InductionPHI is the first Phi node.
-    if (&InductionPHI->getParent()->front() != InductionPHI)
-      InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+    SmallVector<Instruction *, 8> InnerIndexVarList;
+    for (PHINode *CurInductionPHI : InductionPHIs) {
+      if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+        InnerIndexVarList.push_back(
+            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
+      else
+        InnerIndexVarList.push_back(
+            dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
+    }
 
     // Create a new latch block for the inner loop. We split at the
     // current latch's terminator and then move the condition and all
@@ -1290,7 +1280,7 @@ bool LoopInterchangeTransform::transform() {
 
     SmallSetVector<Instruction *, 4> WorkList;
     unsigned i = 0;
-    auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+    auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
       for (; i < WorkList.size(); i++) {
         // Duplicate instruction and move it the new latch. Update uses that
         // have been moved.
@@ -1302,7 +1292,8 @@ bool LoopInterchangeTransform::transform() {
         for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
           Instruction *UserI = cast<Instruction>(U.getUser());
           if (!InnerLoop->contains(UserI->getParent()) ||
-              UserI->getParent() == NewLatch || UserI == InductionPHI)
+              UserI->getParent() == NewLatch ||
+              llvm::is_contained(InductionPHIs, UserI))
             U.set(NewI);
         }
         // Add operands of moved instruction to the worklist, except if they are
@@ -1311,7 +1302,7 @@ bool LoopInterchangeTransform::transform() {
           Instruction *OpI = dyn_cast<Instruction>(Op);
           if (!OpI ||
               this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
-              OpI == InductionPHI)
+              llvm::is_contained(InductionPHIs, OpI))
             continue;
           WorkList.insert(OpI);
         }
@@ -1325,7 +1316,8 @@ bool LoopInterchangeTransform::transform() {
     if (CondI)
       WorkList.insert(CondI);
     MoveInstructions();
-    WorkList.insert(cast<Instruction>(InnerIndexVar));
+    for (Instruction *InnerIndexVar : InnerIndexVarList)
+      WorkList.insert(cast<Instruction>(InnerIndexVar));
     MoveInstructions();
 
     // Splits the inner loops phi nodes out into a separate basic block.
@@ -1624,7 +1616,8 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
   SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
   for (PHINode &PHI : InnerLoopHeader->phis())
     if (OuterInnerReductions.contains(&PHI))
-      InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+      InnerLoopPHIs.push_back(&PHI);
+
   for (PHINode &PHI : OuterLoopHeader->phis())
     if (OuterInnerReductions.contains(&PHI))
       OuterLoopPHIs.push_back(&PHI);
@@ -1638,6 +1631,7 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
     assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
   }
   for (PHINode *PHI : InnerLoopPHIs) {
+    LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
     PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
     assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
   }

diff  --git a/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll b/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
new file mode 100644
index 0000000000000..830d0a26c9507
--- /dev/null
+++ b/llvm/test/Transforms/LoopInterchange/interchangeable-innerloop-multiple-indvars.ll
@@ -0,0 +1,297 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basic-aa -loop-interchange -verify-dom-info -verify-loop-info -verify-scev -verify-loop-lcssa -S | FileCheck %s
+
+ at b = common dso_local local_unnamed_addr global [200 x [200 x i32]] zeroinitializer, align 4
+ at a = common dso_local local_unnamed_addr global i32 0, align 4
+
+;; int a, c, d, e;
+;; int b[200][200];
+;; void fn1() {
+;;   for (c = 0; c < 100; c++) {
+;;     for (d = 5, e = 5; d > 0, e > 0; d--, e--)
+;;       a |= b[d][c + 9];
+;;   }
+;; }
+;
+; There are multiple inner loop indvars and only one
+; of them is used in the loop exit condition at the
+; inner loop latch.
+;
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    br label [[FOR_BODY3_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_OUTER:%.*]] = phi i64 [ [[INDVARS_OUTER_NEXT:%.*]], [[FOR_INC7:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[OR_REDUCTION_INNER:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_INC7]] ], [ [[OR_REDUCTION_OUTER:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = add nsw i64 [[INDVARS_OUTER]], 9
+; CHECK-NEXT:    br label [[FOR_BODY3_SPLIT1:%.*]]
+; CHECK:       for.body3.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVAR0:%.*]] = phi i64 [ [[TMP0:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 5, [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY3_SPLIT]] ], [ 5, [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    [[OR_REDUCTION_OUTER]] = phi i32 [ [[OR_LCSSA:%.*]], [[FOR_BODY3_SPLIT]] ], [ [[A]], [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body3.split1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 [[INDVAR0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[OR]] = or i32 [[OR_REDUCTION_INNER]], [[LOAD_VAL]]
+; CHECK-NEXT:    [[INDVAR0_NEXT:%.*]] = add nsw i64 [[INDVAR0]], -1
+; CHECK-NEXT:    [[INDVAR1_NEXT:%.*]] = add nsw i32 [[INDVAR1]], -1
+; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i32 [[INDVAR1_NEXT]], 0
+; CHECK-NEXT:    br label [[FOR_INC7]]
+; CHECK:       for.body3.split:
+; CHECK-NEXT:    [[OR_LCSSA]] = phi i32 [ [[OR]], [[FOR_INC7]] ]
+; CHECK-NEXT:    [[TMP0]] = add nsw i64 [[INDVAR0]], -1
+; CHECK-NEXT:    [[TMP1]] = add nsw i32 [[INDVAR1]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_COND_FOR_END8_CRIT_EDGE:%.*]], label [[FOR_BODY3]]
+; CHECK:       for.inc7:
+; CHECK-NEXT:    [[INDVARS_OUTER_NEXT]] = add nsw i64 [[INDVARS_OUTER]], 1
+; CHECK-NEXT:    [[INDVARS_OUTER_NEXT_TRUNC:%.*]] = trunc i64 [[INDVARS_OUTER_NEXT]] to i32
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[INDVARS_OUTER_NEXT_TRUNC]], 100
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[FOR_BODY3_SPLIT]], label [[FOR_BODY]]
+; CHECK:       for.cond.for.end8_crit_edge:
+; CHECK-NEXT:    [[OR_LCSSA_LCSSA:%.*]] = phi i32 [ [[OR_LCSSA]], [[FOR_BODY3_SPLIT]] ]
+; CHECK-NEXT:    store i32 [[OR_LCSSA_LCSSA]], i32* @a, align 4
+; CHECK-NEXT:    br label [[FOR_END8:%.*]]
+; CHECK:       for.end8:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  %a = load i32, i32* @a
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc7
+  %indvars.outer = phi i64 [ 0, %entry ], [ %indvars.outer.next, %for.inc7 ]
+  %or.reduction.outer = phi i32 [ %a, %entry ], [ %or.lcssa, %for.inc7 ]
+  %index = add nsw i64 %indvars.outer, 9
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.body3
+  %or.reduction.inner = phi i32 [ %or.reduction.outer, %for.body ], [ %or, %for.body3 ]
+  %indvar0 = phi i64 [ 5, %for.body ], [ %indvar0.next, %for.body3 ]
+  %indvar1 = phi i32 [ 5, %for.body ], [ %indvar1.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 %indvar0, i64 %index
+  %load.val = load i32, i32* %arrayidx5, align 4
+  %or = or i32 %or.reduction.inner, %load.val
+  %indvar0.next = add nsw i64 %indvar0, -1
+  %indvar1.next = add nsw i32 %indvar1, -1
+  %tobool2 = icmp eq i32 %indvar1.next, 0
+  br i1 %tobool2, label %for.inc7, label %for.body3
+
+for.inc7:                                         ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  %indvars.outer.next = add nsw i64 %indvars.outer, 1
+  %indvars.outer.next.trunc = trunc i64 %indvars.outer.next to i32
+  %tobool = icmp eq i32 %indvars.outer.next.trunc, 100
+  br i1 %tobool, label %for.cond.for.end8_crit_edge, label %for.body
+
+for.cond.for.end8_crit_edge:                      ; preds = %for.inc7
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.inc7 ]
+  store i32 %or.lcssa.lcssa, i32* @a
+  br label %for.end8
+
+for.end8:                                         ; preds = %for.cond.for.end8_crit_edge, %entry
+  ret void
+}
+
+;; int a, c, d, e;
+;; int b[200][200];
+;; void fn1() {
+;;   for (c = 0 ; c < 100; c++) {
+;;     for (d = 5, e = 6; d + e > 0; d--, e = e - 2)
+;;       a |= b[d][c + 9];
+;;   }
+;; }
+;
+; All inner loop indvars are used in the inner latch.
+;
+define void @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    br label [[FOR_BODY3_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_OUTER:%.*]] = phi i64 [ [[INDVARS_OUTER_NEXT:%.*]], [[FOR_INC7:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[OR_REDUCTION_INNER:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_INC7]] ], [ [[OR_REDUCTION_OUTER:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = add nsw i64 [[INDVARS_OUTER]], 9
+; CHECK-NEXT:    br label [[FOR_BODY3_SPLIT1:%.*]]
+; CHECK:       for.body3.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVAR0:%.*]] = phi i64 [ [[TMP2:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 5, [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i32 [ [[TMP0:%.*]], [[FOR_BODY3_SPLIT]] ], [ 6, [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    [[OR_REDUCTION_OUTER]] = phi i32 [ [[OR_LCSSA:%.*]], [[FOR_BODY3_SPLIT]] ], [ [[A]], [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body3.split1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 [[INDVAR0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[OR]] = or i32 [[OR_REDUCTION_INNER]], [[LOAD_VAL]]
+; CHECK-NEXT:    [[INDVAR0_NEXT:%.*]] = add nsw i64 [[INDVAR0]], -1
+; CHECK-NEXT:    [[INDVAR1_NEXT:%.*]] = add nsw i32 [[INDVAR1]], -2
+; CHECK-NEXT:    [[INDVAR1_NEXT_EXT:%.*]] = sext i32 [[INDVAR1_NEXT]] to i64
+; CHECK-NEXT:    [[INDVARS_ADD:%.*]] = add nsw i64 [[INDVAR0_NEXT]], [[INDVAR1_NEXT_EXT]]
+; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i64 [[INDVARS_ADD]], 0
+; CHECK-NEXT:    br label [[FOR_INC7]]
+; CHECK:       for.body3.split:
+; CHECK-NEXT:    [[OR_LCSSA]] = phi i32 [ [[OR]], [[FOR_INC7]] ]
+; CHECK-NEXT:    [[TMP0]] = add nsw i32 [[INDVAR1]], -2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[TMP0]] to i64
+; CHECK-NEXT:    [[TMP2]] = add nsw i64 [[INDVAR0]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[FOR_COND_FOR_END8_CRIT_EDGE:%.*]], label [[FOR_BODY3]]
+; CHECK:       for.inc7:
+; CHECK-NEXT:    [[INDVARS_OUTER_NEXT]] = add nsw i64 [[INDVARS_OUTER]], 1
+; CHECK-NEXT:    [[INDVARS_OUTER_NEXT_TRUNC:%.*]] = trunc i64 [[INDVARS_OUTER_NEXT]] to i32
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[INDVARS_OUTER_NEXT_TRUNC]], 100
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[FOR_BODY3_SPLIT]], label [[FOR_BODY]]
+; CHECK:       for.cond.for.end8_crit_edge:
+; CHECK-NEXT:    [[OR_LCSSA_LCSSA:%.*]] = phi i32 [ [[OR_LCSSA]], [[FOR_BODY3_SPLIT]] ]
+; CHECK-NEXT:    store i32 [[OR_LCSSA_LCSSA]], i32* @a, align 4
+; CHECK-NEXT:    br label [[FOR_END8:%.*]]
+; CHECK:       for.end8:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = load i32, i32* @a
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc7
+  %indvars.outer = phi i64 [ 0, %entry ], [ %indvars.outer.next, %for.inc7 ]
+  %or.reduction.outer = phi i32 [ %a, %entry ], [ %or.lcssa, %for.inc7 ]
+  %index = add nsw i64 %indvars.outer, 9
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.body3
+  %or.reduction.inner = phi i32 [ %or.reduction.outer, %for.body ], [ %or, %for.body3 ]
+  %indvar0 = phi i64 [ 5, %for.body ], [ %indvar0.next, %for.body3 ]
+  %indvar1 = phi i32 [ 6, %for.body ], [ %indvar1.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i64 %indvar0, i64 %index
+  %load.val = load i32, i32* %arrayidx5, align 4
+  %or = or i32 %or.reduction.inner, %load.val
+  %indvar0.next = add nsw i64 %indvar0, -1
+  %indvar1.next = add nsw i32 %indvar1, -2
+  %indvar1.next.ext = sext i32 %indvar1.next to i64
+  %indvars.add = add nsw i64 %indvar0.next, %indvar1.next.ext
+  %tobool2 = icmp eq i64 %indvars.add, 0
+  br i1 %tobool2, label %for.inc7, label %for.body3
+
+for.inc7:                                         ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  %indvars.outer.next = add nsw i64 %indvars.outer, 1
+  %indvars.outer.next.trunc = trunc i64 %indvars.outer.next to i32
+  %tobool = icmp eq i32 %indvars.outer.next.trunc, 100
+  br i1 %tobool, label %for.cond.for.end8_crit_edge, label %for.body
+
+for.cond.for.end8_crit_edge:                      ; preds = %for.inc7
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.inc7 ]
+  store i32 %or.lcssa.lcssa, i32* @a
+  br label %for.end8
+
+for.end8:                                         ; preds = %for.cond.for.end8_crit_edge, %entry
+  ret void
+}
+
+;; int a, c, d, e;
+;; int b[200][200];
+;; void fn1() {
+;;   for (c = 0 ; c < 100; c++) {
+;;     d = 5;
+;;     e = 49;
+;;     for (; d != e; d++, e--)
+;;       a |= b[d][c + 9];
+;;   }
+;; }
+;
+; Two inner loop indvars are involved in the inner loop exit
+; condition as LHS and RHS.
+define void @test3() {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    br label [[FOR_BODY3_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_OUTER:%.*]] = phi i64 [ [[INDVARS_OUTER_NEXT:%.*]], [[FOR_INC7:%.*]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[OR_REDUCTION_INNER:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_INC7]] ], [ [[OR_REDUCTION_OUTER:%.*]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = add nsw i64 [[INDVARS_OUTER]], 9
+; CHECK-NEXT:    br label [[FOR_BODY3_SPLIT1:%.*]]
+; CHECK:       for.body3.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY3:%.*]]
+; CHECK:       for.body3:
+; CHECK-NEXT:    [[INDVAR0:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY3_SPLIT:%.*]] ], [ 5, [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVAR1:%.*]] = phi i32 [ [[TMP0:%.*]], [[FOR_BODY3_SPLIT]] ], [ 49, [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    [[OR_REDUCTION_OUTER]] = phi i32 [ [[OR_LCSSA:%.*]], [[FOR_BODY3_SPLIT]] ], [ [[A]], [[FOR_BODY3_PREHEADER]] ]
+; CHECK-NEXT:    br label [[FOR_BODY_PREHEADER]]
+; CHECK:       for.body3.split1:
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i32 [[INDVAR0]], i64 [[INDEX]]
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[OR]] = or i32 [[OR_REDUCTION_INNER]], [[LOAD_VAL]]
+; CHECK-NEXT:    [[INDVAR0_NEXT:%.*]] = add nsw i32 [[INDVAR0]], 1
+; CHECK-NEXT:    [[INDVAR1_NEXT:%.*]] = add nsw i32 [[INDVAR1]], -1
+; CHECK-NEXT:    [[TOBOOL2:%.*]] = icmp eq i32 [[INDVAR0_NEXT]], [[INDVAR1_NEXT]]
+; CHECK-NEXT:    br label [[FOR_INC7]]
+; CHECK:       for.body3.split:
+; CHECK-NEXT:    [[OR_LCSSA]] = phi i32 [ [[OR]], [[FOR_INC7]] ]
+; CHECK-NEXT:    [[TMP0]] = add nsw i32 [[INDVAR1]], -1
+; CHECK-NEXT:    [[TMP1]] = add nsw i32 [[INDVAR0]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[FOR_COND_FOR_END8_CRIT_EDGE:%.*]], label [[FOR_BODY3]]
+; CHECK:       for.inc7:
+; CHECK-NEXT:    [[INDVARS_OUTER_NEXT]] = add nsw i64 [[INDVARS_OUTER]], 1
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i64 [[INDVARS_OUTER_NEXT]], 100
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[FOR_BODY3_SPLIT]], label [[FOR_BODY]]
+; CHECK:       for.cond.for.end8_crit_edge:
+; CHECK-NEXT:    [[OR_LCSSA_LCSSA:%.*]] = phi i32 [ [[OR_LCSSA]], [[FOR_BODY3_SPLIT]] ]
+; CHECK-NEXT:    store i32 [[OR_LCSSA_LCSSA]], i32* @a, align 4
+; CHECK-NEXT:    br label [[FOR_END8:%.*]]
+; CHECK:       for.end8:
+; CHECK-NEXT:    ret void
+;
+
+entry:
+  %a = load i32, i32* @a
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.inc7
+  %indvars.outer = phi i64 [ 0, %entry ], [ %indvars.outer.next, %for.inc7 ]
+  %or.reduction.outer = phi i32 [ %a, %entry ], [ %or.lcssa, %for.inc7 ]
+  %index = add nsw i64 %indvars.outer, 9
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.body, %for.body3
+  %or.reduction.inner = phi i32 [ %or.reduction.outer, %for.body ], [ %or, %for.body3 ]
+  %indvar0 = phi i32 [ 5, %for.body ], [ %indvar0.next, %for.body3 ]
+  %indvar1 = phi i32 [ 49, %for.body ], [ %indvar1.next, %for.body3 ]
+  %arrayidx5 = getelementptr inbounds [200 x [200 x i32]], [200 x [200 x i32]]* @b, i64 0, i32 %indvar0, i64 %index
+  %load.val = load i32, i32* %arrayidx5, align 4
+  %or = or i32 %or.reduction.inner, %load.val
+  %indvar0.next = add nsw i32 %indvar0, 1
+  %indvar1.next = add nsw i32 %indvar1, -1
+  %tobool2 = icmp eq i32 %indvar0.next, %indvar1.next
+  br i1 %tobool2, label %for.inc7, label %for.body3
+
+for.inc7:                                         ; preds = %for.body3
+  %or.lcssa = phi i32 [ %or, %for.body3 ]
+  %indvars.outer.next = add nsw i64 %indvars.outer, 1
+  %tobool = icmp eq i64 %indvars.outer.next, 100
+  br i1 %tobool, label %for.cond.for.end8_crit_edge, label %for.body
+
+for.cond.for.end8_crit_edge:                      ; preds = %for.inc7
+  %or.lcssa.lcssa = phi i32 [ %or.lcssa, %for.inc7 ]
+  store i32 %or.lcssa.lcssa, i32* @a
+  br label %for.end8
+
+for.end8:                                         ; preds = %for.cond.for.end8_crit_edge, %entry
+  ret void
+}


        


More information about the llvm-commits mailing list