[llvm] dfb7a90 - [LoopPred] Robustly handle partially unswitched loops

Thu Nov 21 15:44:42 PST 2019

Author: Philip Reames
Date: 2019-11-21T15:44:36-08:00
New Revision: dfb7a9091affce6e5a8fdb017b4e2d58675bb9b0

URL: https://github.com/llvm/llvm-project/commit/dfb7a9091affce6e5a8fdb017b4e2d58675bb9b0
DIFF: https://github.com/llvm/llvm-project/commit/dfb7a9091affce6e5a8fdb017b4e2d58675bb9b0.diff

LOG: [LoopPred] Robustly handle partially unswitched loops

We may end up with a case where we have a widenable branch above the loop, but not all widenable branches within the loop have been removed.  Since a widenable branch inhibit SCEVs ability to reason about exit counts (by design), we have a tradeoff between effectiveness of this optimization and allowing future widening of the branches within the loop.  LoopPred is thought to be one of the most important optimizations for range check elimination, so let's pay the cost.

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/LoopPredication.cpp
    llvm/test/Transforms/LoopPredication/predicate-exits.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 1962c8ba39fb..1a42f6b23443 100644

--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -1074,6 +1074,35 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
   if (isa<SCEVCouldNotCompute>(LatchEC))
     return false; // profitability - want hot exit in analyzeable set
 
+  // At this point, we have found an analyzeable latch, and a widenable
+  // condition above the loop.  If we have a widenable exit within the loop
+  // (for which we can't compute exit counts), drop the ability to further
+  // widen so that we gain ability to analyze it's exit count and perform this
+  // transform.  TODO: It'd be nice to know for sure the exit became
+  // analyzeable after dropping widenability.
+  {
+    bool Invalidate = false;
+    
+    for (auto *ExitingBB : ExitingBlocks) {
+      if (LI->getLoopFor(ExitingBB) != L)
+        continue;
+
+      auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+      if (!BI)
+        continue;
+
+      Use *Cond, *WC;
+      BasicBlock *IfTrueBB, *IfFalseBB;
+      if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
+          L->contains(IfTrueBB)) {
+        WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
+        Invalidate = true;
+      }
+    }
+    if (Invalidate)
+      SE->forgetLoop(L);
+  }
+
   // The use of umin(all analyzeable exits) instead of latch is subtle, but
   // important for profitability.  We may have a loop which hasn't been fully
   // canonicalized just yet.  If the exit we chose to widen is provably never

diff  --git a/llvm/test/Transforms/LoopPredication/predicate-exits.ll b/llvm/test/Transforms/LoopPredication/predicate-exits.ll
index 0d6eba9015a4..83d85eb74a62 100644
--- a/llvm/test/Transforms/LoopPredication/predicate-exits.ll
+++ b/llvm/test/Transforms/LoopPredication/predicate-exits.ll
@@ -989,6 +989,117 @@ guarded:
 }
 
 
+; If we have a stray widenable branch in the loop, we should still be able to
+; run.  This can happen when unswitching's cost model avoids unswitching some
+; branches.
+define i32 @wb_in_loop(i32* %array, i32 %length, i32 %n, i1 %cond_0) {
+; CHECK-LABEL: @wb_in_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[WIDENABLE_COND:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[WC2:%.*]] = call i1 @llvm.experimental.widenable.condition()
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt i32 [[N:%.*]], 1
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP0]], i32 [[N]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i32 [[LENGTH:%.*]], [[TMP1]]
+; CHECK-NEXT:    [[UMIN:%.*]] = select i1 [[TMP2]], i32 [[LENGTH]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]]
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze i1 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[TMP4]], [[COND_0:%.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ugt i32 [[LENGTH]], [[UMIN]]
+; CHECK-NEXT:    [[TMP7:%.*]] = freeze i1 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = and i1 [[TMP7]], [[TMP5]]
+; CHECK-NEXT:    [[EXIPLICIT_GUARD_COND:%.*]] = and i1 [[TMP8]], [[WIDENABLE_COND]]
+; CHECK-NEXT:    br i1 [[EXIPLICIT_GUARD_COND]], label [[LOOP_PREHEADER:%.*]], label [[DEOPT:%.*]], !prof !0
+; CHECK:       deopt:
+; CHECK-NEXT:    [[DEOPTRET:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
+; CHECK-NEXT:    ret i32 [[DEOPTRET]]
+; CHECK:       loop.preheader:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[LOOP_ACC:%.*]] = phi i32 [ [[LOOP_ACC_NEXT:%.*]], [[GUARDED2:%.*]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[GUARDED2]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    [[WITHIN_BOUNDS:%.*]] = icmp ult i32 [[I]], [[LENGTH]]
+; CHECK-NEXT:    br i1 true, label [[GUARDED:%.*]], label [[DEOPT2:%.*]], !prof !0
+; CHECK:       deopt2:
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    [[DEOPTRET2:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
+; CHECK-NEXT:    ret i32 [[DEOPTRET2]]
+; CHECK:       guarded:
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    [[WITHIN_BOUNDS2:%.*]] = icmp ult i32 [[I]], [[LENGTH]]
+; CHECK-NEXT:    [[WB_COND:%.*]] = and i1 [[WITHIN_BOUNDS2]], true
+; CHECK-NEXT:    br i1 true, label [[GUARDED2]], label [[DEOPT3:%.*]], !prof !0
+; CHECK:       deopt3:
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    [[DEOPTRET3:%.*]] = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
+; CHECK-NEXT:    ret i32 [[DEOPTRET3]]
+; CHECK:       guarded2:
+; CHECK-NEXT:    [[I_I64:%.*]] = zext i32 [[I]] to i64
+; CHECK-NEXT:    [[ARRAY_I_PTR:%.*]] = getelementptr inbounds i32, i32* [[ARRAY:%.*]], i64 [[I_I64]]
+; CHECK-NEXT:    [[ARRAY_I:%.*]] = load i32, i32* [[ARRAY_I_PTR]], align 4
+; CHECK-NEXT:    store i32 0, i32* [[ARRAY_I_PTR]]
+; CHECK-NEXT:    [[LOOP_ACC_NEXT]] = add i32 [[LOOP_ACC]], [[ARRAY_I]]
+; CHECK-NEXT:    [[I_NEXT]] = add nuw i32 [[I]], 1
+; CHECK-NEXT:    [[CONTINUE:%.*]] = icmp ult i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CONTINUE]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[RESULT:%.*]] = phi i32 [ [[LOOP_ACC_NEXT]], [[GUARDED2]] ]
+; CHECK-NEXT:    ret i32 [[RESULT]]
+;
+entry:
+  %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+  %wc2 = call i1 @llvm.experimental.widenable.condition()
+  %exiplicit_guard_cond = and i1 %cond_0, %widenable_cond
+  br i1 %exiplicit_guard_cond, label %loop.preheader, label %deopt, !prof !0
+
+deopt:
+  %deoptret = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
+  ret i32 %deoptret
+
+loop.preheader:
+  br label %loop
+
+loop:
+  %loop.acc = phi i32 [ %loop.acc.next, %guarded2 ], [ 0, %loop.preheader ]
+  %i = phi i32 [ %i.next, %guarded2 ], [ 0, %loop.preheader ]
+  call void @unknown()
+  %within.bounds = icmp ult i32 %i, %length
+  br i1 %within.bounds, label %guarded, label %deopt2, !prof !0
+
+deopt2:
+  call void @unknown()
+  %deoptret2 = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
+  ret i32 %deoptret2
+
+guarded:
+  call void @unknown()
+  %within.bounds2 = icmp ult i32 %i, %length
+  %wb_cond = and i1 %within.bounds2, %wc2
+  br i1 %wb_cond, label %guarded2, label %deopt3, !prof !0
+
+deopt3:
+  call void @unknown()
+  %deoptret3 = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"() ]
+  ret i32 %deoptret3
+
+guarded2:
+  %i.i64 = zext i32 %i to i64
+  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
+  %array.i = load i32, i32* %array.i.ptr, align 4
+  store i32 0, i32* %array.i.ptr
+  %loop.acc.next = add i32 %loop.acc, %array.i
+  %i.next = add nuw i32 %i, 1
+  %continue = icmp ult i32 %i.next, %n
+  br i1 %continue, label %loop, label %exit
+
+exit:
+  %result = phi i32 [ %loop.acc.next, %guarded2 ]
+  ret i32 %result
+}
+
+
+
 declare void @unknown()
 
 declare i1 @llvm.experimental.widenable.condition()