[llvm] r327617 - [LoopUnroll] Ignore ephemeral values when checking full unroll profitability.

Thu Mar 15 02:59:15 PDT 2018

Author: a.elovikov
Date: Thu Mar 15 02:59:15 2018
New Revision: 327617

URL: http://llvm.org/viewvc/llvm-project?rev=327617&view=rev
Log:
[LoopUnroll] Ignore ephemeral values when checking full unroll profitability.

Summary:
Before this patch call graph is like this in the LoopUnrollPass:

  tryToUnrollLoop
    ApproximateLoopSize
      collectEphemeralValues
      /* Use collected ephemeral values */
    computeUnrollCount
      analyzeLoopUnrollCost
        /* Bail out from the analysis if loop contains CallInst */

This patch moves collection of the ephemeral values to the tryToUnrollLoop
function and passes the collected values into both ApproximateLoopsize (as
before) and additionally starts using them in analyzeLoopUnrollCost:

  tryToUnrollLoop
    collectEphemeralValues
    ApproximateLoopSize(EphValues)
      /* Use EphValues */
    computeUnrollCount(EphValues)
      analyzeLoopUnrollCost(EphValues)
        /* Ignore ephemeral values - they don't contribute to the final cost */
        /* Bail out from the analysis if loop contains CallInst */

Reviewers: mzolotukhin, evstupac, sanjoy

Reviewed By: evstupac

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D43931

Added:
    llvm/trunk/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp

Modified: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp?rev=327617&r1=327616&r2=327617&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp Thu Mar 15 02:59:15 2018
@@ -308,10 +308,10 @@ struct EstimatedUnrollCost {
 /// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
 /// the analysis failed (no benefits expected from the unrolling, or the loop is
 /// too big to analyze), the returned value is None.
-static Optional<EstimatedUnrollCost>
-analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
-                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
-                      unsigned MaxUnrolledLoopSize) {
+static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
+    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
+    const SmallPtrSetImpl<const Value *> &EphValues,
+    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
   // We want to be able to scale offsets by the trip count and add more offsets
   // to them without checking for overflows, and we already don't want to
   // analyze *massive* trip counts, so we force the max to be reasonably small.
@@ -490,7 +490,9 @@ analyzeLoopUnrollCost(const Loop *L, uns
       // it.  We don't change the actual IR, just count optimization
       // opportunities.
       for (Instruction &I : *BB) {
-        if (isa<DbgInfoIntrinsic>(I))
+        // These won't get into the final code - don't even try calculating the
+        // cost for them.
+        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
           continue;
 
         // Track this instruction's expected baseline cost when executing the
@@ -607,13 +609,11 @@ analyzeLoopUnrollCost(const Loop *L, uns
 }
 
 /// ApproximateLoopSize - Approximate the size of the loop.
-static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
-                                    bool &NotDuplicatable, bool &Convergent,
-                                    const TargetTransformInfo &TTI,
-                                    AssumptionCache *AC, unsigned BEInsns) {
-  SmallPtrSet<const Value *, 32> EphValues;
-  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
+static unsigned
+ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable,
+                    bool &Convergent, const TargetTransformInfo &TTI,
+                    const SmallPtrSetImpl<const Value *> &EphValues,
+                    unsigned BEInsns) {
   CodeMetrics Metrics;
   for (BasicBlock *BB : L->blocks())
     Metrics.analyzeBasicBlock(BB, TTI, EphValues);
@@ -708,8 +708,9 @@ static uint64_t getUnrolledLoopSize(
 // Calculates unroll count and writes it to UP.Count.
 static bool computeUnrollCount(
     Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
-    ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
-    unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
+    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+    OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+    unsigned &TripMultiple, unsigned LoopSize,
     TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
   // Check for explicit Count.
   // 1st priority is unroll count set by "unroll-count" option.
@@ -779,7 +780,7 @@ static bool computeUnrollCount(
       // helps to remove a significant number of instructions.
       // To check that, run additional analysis on the loop.
       if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
-              L, FullUnrollTripCount, DT, SE, TTI,
+              L, FullUnrollTripCount, DT, SE, EphValues, TTI,
               UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
         unsigned Boost =
             getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
@@ -975,8 +976,13 @@ static LoopUnrollResult tryToUnrollLoop(
   // Exit early if unrolling is disabled.
   if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
     return LoopUnrollResult::Unmodified;
-  unsigned LoopSize = ApproximateLoopSize(
-      L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
+
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+  unsigned LoopSize =
+      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+                          TTI, EphValues, UP.BEInsns);
   DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
   if (NotDuplicatable) {
     DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
@@ -1040,9 +1046,9 @@ static LoopUnrollResult tryToUnrollLoop(
   // computeUnrollCount() decides whether it is beneficial to use upper bound to
   // fully unroll the loop.
   bool UseUpperBound = false;
-  bool IsCountSetExplicitly =
-      computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
-                         TripMultiple, LoopSize, UP, UseUpperBound);
+  bool IsCountSetExplicitly = computeUnrollCount(
+      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
+      TripMultiple, LoopSize, UP, UseUpperBound);
   if (!UP.Count)
     return LoopUnrollResult::Unmodified;
   // Unroll factor (Count) must be less or equal to TripCount.

Added: llvm/trunk/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll?rev=327617&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll Thu Mar 15 02:59:15 2018
@@ -0,0 +1,119 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S < %s -loop-unroll -unroll-threshold=42 | FileCheck %s --check-prefix=ANALYZE-FULL
+
+; This test is supposed to check that calls to @llvm.assume builtin are not
+; prohibiting the analysis of full unroll profitability in case the cost of the
+; unrolled loop (not acounting to any simplifications done by such unrolling) is
+; higher than some threshold.
+;
+; Ensure that we indeed are testing this code path by verifying that the loop is
+; not unrolled without such analysis:
+
+; RUN: opt -S < %s -loop-unroll -unroll-threshold=42 -unroll-max-iteration-count-to-analyze=2 \
+; RUN:        | FileCheck %s --check-prefix=DONT-ANALYZE-FULL
+
+; Function Attrs: nounwind
+declare void @llvm.assume(i1) #1
+
+define i32 @foo(i32* %a) {
+; ANALYZE-FULL-LABEL: @foo(
+; ANALYZE-FULL-NEXT:  entry:
+; ANALYZE-FULL-NEXT:    br label [[FOR_BODY:%.*]]
+; ANALYZE-FULL:       for.body:
+; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE:%.*]], label [[FOR_NEXT:%.*]]
+; ANALYZE-FULL:       do_store:
+; ANALYZE-FULL-NEXT:    store i32 0, i32* [[A:%.*]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT]]
+; ANALYZE-FULL:       for.next:
+; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE_1:%.*]], label [[FOR_NEXT_1:%.*]]
+; ANALYZE-FULL:       do_store.1:
+; ANALYZE-FULL-NEXT:    [[GEP_1:%.*]] = getelementptr i32, i32* [[A]], i32 1
+; ANALYZE-FULL-NEXT:    store i32 1, i32* [[GEP_1]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_1]]
+; ANALYZE-FULL:       for.next.1:
+; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE_2:%.*]], label [[FOR_NEXT_2:%.*]]
+; ANALYZE-FULL:       do_store.2:
+; ANALYZE-FULL-NEXT:    [[GEP_2:%.*]] = getelementptr i32, i32* [[A]], i32 2
+; ANALYZE-FULL-NEXT:    store i32 2, i32* [[GEP_2]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_2]]
+; ANALYZE-FULL:       for.next.2:
+; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE_3:%.*]], label [[FOR_NEXT_3:%.*]]
+; ANALYZE-FULL:       do_store.3:
+; ANALYZE-FULL-NEXT:    [[GEP_3:%.*]] = getelementptr i32, i32* [[A]], i32 3
+; ANALYZE-FULL-NEXT:    store i32 3, i32* [[GEP_3]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_3]]
+; ANALYZE-FULL:       for.next.3:
+; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_4:%.*]], label [[FOR_NEXT_4:%.*]]
+; ANALYZE-FULL:       do_store.4:
+; ANALYZE-FULL-NEXT:    [[GEP_4:%.*]] = getelementptr i32, i32* [[A]], i32 4
+; ANALYZE-FULL-NEXT:    store i32 4, i32* [[GEP_4]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_4]]
+; ANALYZE-FULL:       for.next.4:
+; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_5:%.*]], label [[FOR_NEXT_5:%.*]]
+; ANALYZE-FULL:       do_store.5:
+; ANALYZE-FULL-NEXT:    [[GEP_5:%.*]] = getelementptr i32, i32* [[A]], i32 5
+; ANALYZE-FULL-NEXT:    store i32 5, i32* [[GEP_5]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_5]]
+; ANALYZE-FULL:       for.next.5:
+; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_6:%.*]], label [[FOR_NEXT_6:%.*]]
+; ANALYZE-FULL:       do_store.6:
+; ANALYZE-FULL-NEXT:    [[GEP_6:%.*]] = getelementptr i32, i32* [[A]], i32 6
+; ANALYZE-FULL-NEXT:    store i32 6, i32* [[GEP_6]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_6]]
+; ANALYZE-FULL:       for.next.6:
+; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_7:%.*]], label [[FOR_NEXT_7:%.*]]
+; ANALYZE-FULL:       do_store.7:
+; ANALYZE-FULL-NEXT:    [[GEP_7:%.*]] = getelementptr i32, i32* [[A]], i32 7
+; ANALYZE-FULL-NEXT:    store i32 7, i32* [[GEP_7]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_7]]
+; ANALYZE-FULL:       for.next.7:
+; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_8:%.*]], label [[FOR_NEXT_8:%.*]]
+; ANALYZE-FULL:       do_store.8:
+; ANALYZE-FULL-NEXT:    [[GEP_8:%.*]] = getelementptr i32, i32* [[A]], i32 8
+; ANALYZE-FULL-NEXT:    store i32 8, i32* [[GEP_8]]
+; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_8]]
+; ANALYZE-FULL:       for.next.8:
+; ANALYZE-FULL-NEXT:    ret i32 9
+;
+; DONT-ANALYZE-FULL-LABEL: @foo(
+; DONT-ANALYZE-FULL-NEXT:  entry:
+; DONT-ANALYZE-FULL-NEXT:    br label [[FOR_BODY:%.*]]
+; DONT-ANALYZE-FULL:       for.body:
+; DONT-ANALYZE-FULL-NEXT:    [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_NEXT:%.*]] ]
+; DONT-ANALYZE-FULL-NEXT:    [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
+; DONT-ANALYZE-FULL-NEXT:    [[CMP:%.*]] = icmp ule i32 [[INDVAR]], 20
+; DONT-ANALYZE-FULL-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; DONT-ANALYZE-FULL-NEXT:    [[CMP2:%.*]] = icmp ule i32 [[INDVAR]], 3
+; DONT-ANALYZE-FULL-NEXT:    br i1 [[CMP2]], label [[DO_STORE:%.*]], label [[FOR_NEXT]]
+; DONT-ANALYZE-FULL:       do_store:
+; DONT-ANALYZE-FULL-NEXT:    [[GEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i32 [[INDVAR]]
+; DONT-ANALYZE-FULL-NEXT:    store i32 [[INDVAR]], i32* [[GEP]]
+; DONT-ANALYZE-FULL-NEXT:    br label [[FOR_NEXT]]
+; DONT-ANALYZE-FULL:       for.next:
+; DONT-ANALYZE-FULL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INDVAR_NEXT]], 9
+; DONT-ANALYZE-FULL-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[LOOPEXIT:%.*]]
+; DONT-ANALYZE-FULL:       loopexit:
+; DONT-ANALYZE-FULL-NEXT:    [[INDVAR_NEXT_LCSSA:%.*]] = phi i32 [ [[INDVAR_NEXT]], [[FOR_NEXT]] ]
+; DONT-ANALYZE-FULL-NEXT:    ret i32 [[INDVAR_NEXT_LCSSA]]
+;
+entry:
+  br label %for.body
+for.body:
+  %indvar = phi i32 [ 0, %entry ], [ %indvar.next, %for.next ]
+  %indvar.next = add i32 %indvar, 1
+  %cmp = icmp ule i32 %indvar, 20
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp2 = icmp ule i32 %indvar, 3
+  br i1 %cmp2, label %do_store, label %for.next
+
+do_store:
+  %gep = getelementptr i32, i32* %a, i32 %indvar
+  store i32 %indvar, i32* %gep
+  br label %for.next
+
+for.next:
+  %exitcond = icmp ne i32 %indvar.next, 9
+  br i1 %exitcond, label %for.body, label %loopexit
+loopexit:
+  ret i32 %indvar.next
+}