[llvm] 2df8bf9 - [LoopFlatten] Fix missed LoopFlatten opportunity

Thu Jul 29 01:53:28 PDT 2021

Author: Rosie Sumpter
Date: 2021-07-29T09:47:41+01:00
New Revision: 2df8bf9339e43de63d8d28e07182e1d6d7ffb843

URL: https://github.com/llvm/llvm-project/commit/2df8bf9339e43de63d8d28e07182e1d6d7ffb843
DIFF: https://github.com/llvm/llvm-project/commit/2df8bf9339e43de63d8d28e07182e1d6d7ffb843.diff

LOG: [LoopFlatten] Fix missed LoopFlatten opportunity

When the trip count of the inner loop is a constant, the InstCombine
pass now causes the transformation e.g. imcp ult i32 %inc, tripcount ->
icmp ult %j, tripcount-step (where %j is the inner loop induction
variable and %inc is add %j, step), which is now accounted for when
identifying the trip count of the loop. This is also an acceptable use
of %j (provided the step is 1) so is ignored as long as the compare
that it's used in is also the condition of the inner branch.

Differential Revision: https://reviews.llvm.org/D105802

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopFlatten.cpp
    llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
    llvm/test/Transforms/LoopFlatten/loop-flatten.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 46f12aafb552d..9b94cf9dde9f5 100644

--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -167,8 +167,7 @@ static bool findLoopComponents(
   // count computed by SCEV then this is either because the trip count variable
   // has been widened (then leave the trip count as it is), or because it is a
   // constant and another transformation has changed the compare, e.g.
-  // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, then we don't flatten
-  // the loop (yet).
+  // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1.
   TripCount = Compare->getOperand(1);
   const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
   if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
@@ -176,12 +175,22 @@ static bool findLoopComponents(
     return false;
   }
   const SCEV *SCEVTripCount = SE->getTripCountFromExitCount(BackedgeTakenCount);
-  if (SE->getSCEV(TripCount) != SCEVTripCount) {
-    if (!IsWidened) {
-      LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
-      return false;
-    }
-    auto TripCountInst = dyn_cast<Instruction>(TripCount);
+  if (SE->getSCEV(TripCount) != SCEVTripCount && !IsWidened) {
+    ConstantInt *RHS = dyn_cast<ConstantInt>(TripCount);
+    // If the IV hasn't been widened, the only way the RHS of the Compare can be
+    // 
diff erent from the SCEV trip count is if it is a constant which has been
+    // changed by another transformation.
+    assert(RHS && "Expected RHS of compare to be constant");
+    // The L->isCanonical check above ensures we only get here if the loop
+    // increments by 1 on each iteration, so the RHS of the Compare is
+    // tripcount-1 (i.e equivalent to the backedge taken count).
+    assert(SE->getSCEV(RHS) == BackedgeTakenCount &&
+           "Expected RHS of compare to be equal to the backedge taken count");
+    ConstantInt *One = ConstantInt::get(RHS->getType(), 1);
+    TripCount = ConstantInt::get(TripCount->getContext(),
+                                 RHS->getValue() + One->getValue());
+  } else if (SE->getSCEV(TripCount) != SCEVTripCount) {
+    auto *TripCountInst = dyn_cast<Instruction>(TripCount);
     if (!TripCountInst) {
       LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
       return false;
@@ -368,6 +377,13 @@ static bool checkIVUsers(FlattenInfo &FI) {
       U = *U->user_begin();
     }
 
+    // If the use is in the compare (which is also the condition of the inner
+    // branch) then the compare has been altered by another transformation e.g
+    // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is
+    // a constant. Ignore this use as the compare gets removed later anyway.
+    if (U == FI.InnerBranch->getCondition())
+      continue;
+
     LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
 
     Value *MatchedMul;

diff  --git a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
index 3806414e1e503..1e264ff427524 100644
--- a/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
@@ -341,38 +341,111 @@ for.end8:                                         ; preds = %for.inc6
   ret i32 10
 }
 
-; When the loop trip count is a constant (e.g. 20) and the step size is
-; 1, InstCombine causes the transformation icmp ult i32 %inc, 20 ->
-; icmp ult i32 %j, 19. In this case a valid trip count is not found so
-; the loop is not flattened. 
-define i32 @test9(i32* nocapture %A) {
+; test_10, test_11 and test_12 are for the case when the
+; inner trip count is a constant, then the InstCombine pass
+; makes the transformation icmp ult i32 %inc, tripcount ->
+; icmp ult i32 %j, tripcount-step.
+
+; test_10: The step is not 1.
+define i32 @test_10(i32* nocapture %A) {
 entry:
   br label %for.cond1.preheader
 
 for.cond1.preheader:
-  %i.017 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+  %i.017 = phi i32 [ 0, %entry ], [ %inc, %for.cond.cleanup3 ]
   %mul = mul i32 %i.017, 20
   br label %for.body4
 
+for.body4:
+  %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %add5, %for.body4 ]
+  %add = add i32 %j.016, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 30, i32* %arrayidx, align 4
+  %add5 = add nuw nsw i32 %j.016, 2
+  %cmp2 = icmp ult i32 %j.016, 18
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
 for.cond.cleanup3:
-  %inc6 = add i32 %i.017, 1
-  %cmp = icmp ult i32 %inc6, 11
+  %inc = add i32 %i.017, 1
+  %cmp = icmp ult i32 %inc, 11
   br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
 
+for.cond.cleanup:
+  %0 = load i32, i32* %A, align 4
+  ret i32 %0
+}
+
+; test_11: The inner inducation variable is used in a compare which
+; isn't the condition of the inner branch.
+define i32 @test_11(i32* nocapture %A) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.020 = phi i32 [ 0, %entry ], [ %inc7, %for.cond.cleanup3 ]
+  %mul = mul i32 %i.020, 20
+  br label %for.body4
+
 for.body4:
-  %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
-  %add = add i32 %j.016, %mul
+  %j.019 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
+  %cmp5 = icmp ult i32 %j.019, 5
+  %cond = select i1 %cmp5, i32 30, i32 15
+  %add = add i32 %j.019, %mul
   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
-  store i32 30, i32* %arrayidx, align 4
-  %inc = add nuw nsw i32 %j.016, 1
-  %cmp2 = icmp ult i32 %j.016, 19
+  store i32 %cond, i32* %arrayidx, align 4
+  %inc = add nuw nsw i32 %j.019, 1
+  %cmp2 = icmp ult i32 %j.019, 19
   br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
 
+for.cond.cleanup3:
+  %inc7 = add i32 %i.020, 1
+  %cmp = icmp ult i32 %inc7, 11
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+
 for.cond.cleanup:
   %0 = load i32, i32* %A, align 4
   ret i32 %0
 }
 
+; test_12: Incoming phi node value for preheader is a variable
+define i32 @test_12(i32* %A) {
+entry:
+  br label %while.cond1.preheader
+
+while.cond1.preheader:
+  %j.017 = phi i32 [ 0, %entry ], [ %j.1, %while.end ]
+  %i.016 = phi i32 [ 0, %entry ], [ %inc4, %while.end ]
+  %mul = mul i32 %i.016, 20
+  %cmp214 = icmp ult i32 %j.017, 20
+  br i1 %cmp214, label %while.body3.preheader, label %while.end
+
+while.body3.preheader:
+  br label %while.body3
+
+while.body3:
+  %j.115 = phi i32 [ %inc, %while.body3 ], [ %j.017, %while.body3.preheader ]
+  %add = add i32 %j.115, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 30, i32* %arrayidx, align 4
+  %inc = add nuw nsw i32 %j.115, 1
+  %cmp2 = icmp ult i32 %j.115, 19
+  br i1 %cmp2, label %while.body3, label %while.end.loopexit
+
+while.end.loopexit:
+  %inc.lcssa = phi i32 [ %inc, %while.body3 ]
+  br label %while.end
+  
+while.end:
+  %j.1 = phi i32 [ %j.017, %while.cond1.preheader], [ %inc.lcssa, %while.end.loopexit ]
+  %inc4 = add i32 %i.016, 1
+  %cmp = icmp ult i32 %inc4, 11
+  br i1 %cmp, label %while.cond1.preheader, label %while.end5
+
+while.end5:
+  %0 = load i32, i32* %A, align 4
+  ret i32 %0
+}
+
 ; Outer loop conditional phi
 define i32 @e() {
 entry:

diff  --git a/llvm/test/Transforms/LoopFlatten/loop-flatten.ll b/llvm/test/Transforms/LoopFlatten/loop-flatten.ll
index 2d7f897472cea..73f50c0c843c5 100644
--- a/llvm/test/Transforms/LoopFlatten/loop-flatten.ll
+++ b/llvm/test/Transforms/LoopFlatten/loop-flatten.ll
@@ -586,6 +586,59 @@ for.end8:                                         ; preds = %for.inc6
   ret i32 10
 }
 
+; When the inner loop trip count is a constant and the step
+; is 1, the InstCombine pass causes the transformation e.g.
+; icmp ult i32 %inc, 20 -> icmp ult i32 %j, 19. This doesn't
+; match the pattern (OuterPHI * InnerTripCount) + InnerPHI but
+; we should still flatten the loop as the compare is removed
+; later anyway.
+define i32 @test9(i32* nocapture %A) {
+entry:
+  br label %for.cond1.preheader
+; CHECK-LABEL: test9
+; CHECK: entry:
+; CHECK: %flatten.tripcount = mul i32 20, 11
+; CHECK: br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.017 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+  %mul = mul i32 %i.017, 20
+  br label %for.body4
+; CHECK: for.cond1.preheader:
+; CHECK:   %i.017 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+; CHECK:   %mul = mul i32 %i.017, 20
+; CHECK:   br label %for.body4
+
+for.cond.cleanup3:
+  %inc6 = add i32 %i.017, 1
+  %cmp = icmp ult i32 %inc6, 11
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+; CHECK: for.cond.cleanup3:
+; CHECK:   %inc6 = add i32 %i.017, 1
+; CHECK:   %cmp = icmp ult i32 %inc6, %flatten.tripcount
+; CHECK:   br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.body4:
+  %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
+  %add = add i32 %j.016, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 30, i32* %arrayidx, align 4
+  %inc = add nuw nsw i32 %j.016, 1
+  %cmp2 = icmp ult i32 %j.016, 19
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+; CHECK: for.body4
+; CHECK:   %j.016 = phi i32 [ 0, %for.cond1.preheader ]
+; CHECK:   %add = add i32 %j.016, %mul
+; CHECK:   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.017
+; CHECK:   store i32 30, i32* %arrayidx, align 4
+; CHECK:   %inc = add nuw nsw i32 %j.016, 1
+; CHECK:   %cmp2 = icmp ult i32 %j.016, 19
+; CHECK:   br label %for.cond.cleanup3
+
+for.cond.cleanup:
+  %0 = load i32, i32* %A, align 4
+  ret i32 %0
+}
 
 declare i32 @func(i32)