[llvm] r255454 - [LoopVectorizer] Refine loop vectorizer's register usage calculator by ignoring specific instructions.

Sun Dec 13 00:44:39 PST 2015

Author: conghou
Date: Sun Dec 13 02:44:08 2015
New Revision: 255454

URL: http://llvm.org/viewvc/llvm-project?rev=255454&view=rev
Log:
[LoopVectorizer] Refine loop vectorizer's register usage calculator by ignoring specific instructions.

LoopVectorizationCostModel::calculateRegisterUsage() is used to estimate the
register usage for specific VFs. However, it takes into account many
instructions that won't be vectorized, such as induction variables,
GetElementPtr instruction, etc.. This makes the loop vectorizer too conservative
when choosing VF. In this patch, the induction variables that won't be
vectorized plus GetElementPtr instruction will be added to ValuesToIgnore set
so that their register usage won't be considered any more.


Differential revision: http://reviews.llvm.org/D15177



Added:
    llvm/trunk/test/Transforms/LoopVectorize/reg-usage.ll
Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=255454&r1=255453&r2=255454&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Sun Dec 13 02:44:08 2015
@@ -1409,15 +1409,14 @@ private:
 /// different operations.
 class LoopVectorizationCostModel {
 public:
-  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
-                             LoopVectorizationLegality *Legal,
+  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
+                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
                              AssumptionCache *AC, const Function *F,
-                             const LoopVectorizeHints *Hints,
-                             SmallPtrSetImpl<const Value *> &ValuesToIgnore)
-      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
-        TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
+                             const LoopVectorizeHints *Hints)
+      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+        AC(AC), TheFunction(F), Hints(Hints) {}
 
   /// Information about vectorization costs
   struct VectorizationFactor {
@@ -1465,6 +1464,9 @@ public:
   SmallVector<RegisterUsage, 8>
   calculateRegisterUsage(const SmallVector<unsigned, 8> &VFs);
 
+  /// Collect values we want to ignore in the cost model.
+  void collectValuesToIgnore();
+
 private:
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
@@ -1496,8 +1498,8 @@ public:
 
   /// The loop that we evaluate.
   Loop *TheLoop;
-  /// Scev analysis.
-  ScalarEvolution *SE;
+  /// Predicated scalar evolution analysis.
+  PredicatedScalarEvolution &PSE;
   /// Loop Info analysis.
   LoopInfo *LI;
   /// Vectorization legality.
@@ -1506,13 +1508,17 @@ public:
   const TargetTransformInfo &TTI;
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
-  /// Demanded bits analysis
+  /// Demanded bits analysis.
   DemandedBits *DB;
+  /// Assumption cache.
+  AssumptionCache *AC;
   const Function *TheFunction;
-  // Loop Vectorize Hint.
+  /// Loop Vectorize Hint.
   const LoopVectorizeHints *Hints;
-  // Values to ignore in the cost model.
-  const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
+  /// Values to ignore in the cost model.
+  SmallPtrSet<const Value *, 16> ValuesToIgnore;
+  /// Values to ignore in the cost model when VF > 1.
+  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
 };
 
 /// \brief This holds vectorization requirements that must be verified late in
@@ -1757,19 +1763,10 @@ struct LoopVectorize : public FunctionPa
       return false;
     }
 
-    // Collect values we want to ignore in the cost model. This includes
-    // type-promoting instructions we identified during reduction detection.
-    SmallPtrSet<const Value *, 32> ValuesToIgnore;
-    CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
-    for (auto &Reduction : *LVL.getReductionVars()) {
-      RecurrenceDescriptor &RedDes = Reduction.second;
-      SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
-      ValuesToIgnore.insert(Casts.begin(), Casts.end());
-    }
-
     // Use the cost model.
-    LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC,
-                                  F, &Hints, ValuesToIgnore);
+    LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
+                                  &Hints);
+    CM.collectValuesToIgnore();
 
     // Check the function attributes to find out if this function should be
     // optimized for size.
@@ -4737,7 +4734,7 @@ LoopVectorizationCostModel::selectVector
   }
 
   // Find the trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@@ -4939,7 +4936,7 @@ unsigned LoopVectorizationCostModel::sel
     return 1;
 
   // Do not interleave loops with a relatively small trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
+  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
     return 1;
 
@@ -5167,15 +5164,15 @@ LoopVectorizationCostModel::calculateReg
     // Ignore instructions that are never used within the loop.
     if (!Ends.count(I)) continue;
 
-    // Skip ignored values.
-    if (ValuesToIgnore.count(I))
-      continue;
-
     // Remove all of the instructions that end at this location.
     InstrList &List = TransposeEnds[i];
     for (unsigned int j = 0, e = List.size(); j < e; ++j)
       OpenIntervals.erase(List[j]);
 
+    // Skip ignored values.
+    if (ValuesToIgnore.count(I))
+      continue;
+
     // For each VF find the maximum usage of registers.
     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
       if (VFs[j] == 1) {
@@ -5185,8 +5182,12 @@ LoopVectorizationCostModel::calculateReg
 
       // Count the number of live intervals.
       unsigned RegUsage = 0;
-      for (auto Inst : OpenIntervals)
+      for (auto Inst : OpenIntervals) {
+        // Skip ignored values for VF > 1.
+        if (VecValuesToIgnore.count(Inst))
+          continue;
         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
+      }
       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
     }
 
@@ -5330,6 +5331,7 @@ LoopVectorizationCostModel::getInstructi
   if (VF > 1 && MinBWs.count(I))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
   Type *VectorTy = ToVectorTy(RetTy, VF);
+  auto SE = PSE.getSE();
 
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
@@ -5631,6 +5633,79 @@ bool LoopVectorizationCostModel::isConse
   return false;
 }
 
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+  // Ignore ephemeral values.
+  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+  // Ignore type-promoting instructions we identified during reduction
+  // detection.
+  for (auto &Reduction : *Legal->getReductionVars()) {
+    RecurrenceDescriptor &RedDes = Reduction.second;
+    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
+    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+  }
+
+  // Ignore induction phis that are only used in either GetElementPtr or ICmp
+  // instruction to exit loop. Induction variables usually have large types and
+  // can have big impact when estimating register usage.
+  // This is for when VF > 1.
+  for (auto &Induction : *Legal->getInductionVars()) {
+    auto *PN = Induction.first;
+    auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
+
+    // Check that the PHI is only used by the induction increment (UpdateV) or
+    // by GEPs. Then check that UpdateV is only used by a compare instruction or
+    // the loop header PHI.
+    // FIXME: Need precise def-use analysis to determine if this instruction
+    // variable will be vectorized.
+    if (std::all_of(PN->user_begin(), PN->user_end(),
+                    [&](const User *U) -> bool {
+                      return U == UpdateV || isa<GetElementPtrInst>(U);
+                    }) &&
+        std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
+                    [&](const User *U) -> bool {
+                      return U == PN || isa<ICmpInst>(U);
+                    })) {
+      VecValuesToIgnore.insert(PN);
+      VecValuesToIgnore.insert(UpdateV);
+    }
+  }
+
+  // Ignore instructions that will not be vectorized.
+  // This is for when VF > 1.
+  for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
+       ++bb) {
+    for (auto &Inst : **bb) {
+      switch (Inst.getOpcode()) {
+      case Instruction::GetElementPtr: {
+        // Ignore GEP if its last operand is an induction variable so that it is
+        // a consecutive load/store and won't be vectorized as scatter/gather
+        // pattern.
+
+        GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
+        unsigned NumOperands = Gep->getNumOperands();
+        unsigned InductionOperand = getGEPInductionOperand(Gep);
+        bool GepToIgnore = true;
+
+        // Check that all of the gep indices are uniform except for the
+        // induction operand.
+        for (unsigned i = 0; i != NumOperands; ++i) {
+          if (i != InductionOperand &&
+              !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
+                                            TheLoop)) {
+            GepToIgnore = false;
+            break;
+          }
+        }
+
+        if (GepToIgnore)
+          VecValuesToIgnore.insert(&Inst);
+        break;
+      }
+      }
+    }
+  }
+}
 
 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
                                              bool IfPredicateStore) {

Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll?rev=255454&r1=255453&r2=255454&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll Sun Dec 13 02:44:08 2015
@@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gn
 ; -vectorizer-maximize-bandwidth is indicated.
 ;
 ; CHECK-label: foo
-; CHECK: LV: Selecting VF: 16.
+; CHECK: LV: Selecting VF: 32.
 define void @foo() {
 entry:
   br label %for.body

Added: llvm/trunk/test/Transforms/LoopVectorize/reg-usage.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/reg-usage.ll?rev=255454&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/reg-usage.ll (added)
+++ llvm/trunk/test/Transforms/LoopVectorize/reg-usage.ll Sun Dec 13 02:44:08 2015
@@ -0,0 +1,68 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth 2>&1 | FileCheck %s
+
+
+ at a = global [1024 x i8] zeroinitializer, align 16
+ at b = global [1024 x i8] zeroinitializer, align 16
+
+define i32 @foo() {
+; This function has a loop of SAD pattern. Here we check when VF = 16 the
+; register usage doesn't exceed 16.
+;
+; CHECK-LABEL: foo
+; CHECK:      LV(REG): VF = 4
+; CHECK-NEXT: LV(REG): Found max usage: 4
+; CHECK:      LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 7
+; CHECK:      LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 13
+
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  ret i32 %add.lcssa
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
+  %1 = load i8, i8* %arrayidx2, align 1
+  %conv3 = zext i8 %1 to i32
+  %sub = sub nsw i32 %conv, %conv3
+  %ispos = icmp sgt i32 %sub, -1
+  %neg = sub nsw i32 0, %sub
+  %2 = select i1 %ispos, i32 %sub, i32 %neg
+  %add = add nsw i32 %2, %s.015
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @bar(i64* nocapture %a) {
+; CHECK-LABEL: bar
+; CHECK:       LV(REG): VF = 2
+; CHECK:       LV(REG): Found max usage: 4
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  %add2.lcssa = phi i64 [ %add2, %for.body ]
+  ret i64 %add2.lcssa
+
+for.body:
+  %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.012
+  store i64 %add, i64* %arrayidx, align 8
+  %add2 = add nsw i64 %add, %s.011
+  %inc = add nuw nsw i64 %i.012, 1
+  %exitcond = icmp eq i64 %inc, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}