[llvm] r270113 - Recommit r255691 since PR26509 has been fixed.

Thu May 19 15:02:00 PDT 2016

Ok, will do that. Thanks.

Wei.

On Thu, May 19, 2016 at 2:36 PM, Quentin Colombet <qcolombet at apple.com> wrote:
> Hi Wei,
>
> For future reference, it is nice to repeat the original commit message.
> That avoids digging into logs.
>
> Cheers,
> -Quentin
>> On May 19, 2016, at 1:38 PM, Wei Mi via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>>
>> Author: wmi
>> Date: Thu May 19 15:38:03 2016
>> New Revision: 270113
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=270113&view=rev
>> Log:
>> Recommit r255691 since PR26509 has been fixed.
>>
>> Added:
>>    llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll
>> Modified:
>>    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>>    llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
>>
>> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=270113&r1=270112&r2=270113&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu May 19 15:38:03 2016
>> @@ -1518,15 +1518,14 @@ private:
>> /// different operations.
>> class LoopVectorizationCostModel {
>> public:
>> -  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
>> -                             LoopVectorizationLegality *Legal,
>> +  LoopVectorizationCostModel(Loop *L, PredicatedScalarEvolution &PSE,
>> +                             LoopInfo *LI, LoopVectorizationLegality *Legal,
>>                              const TargetTransformInfo &TTI,
>>                              const TargetLibraryInfo *TLI, DemandedBits *DB,
>>                              AssumptionCache *AC, const Function *F,
>> -                             const LoopVectorizeHints *Hints,
>> -                             SmallPtrSetImpl<const Value *> &ValuesToIgnore)
>> -      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
>> -        TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}
>> +                             const LoopVectorizeHints *Hints)
>> +      : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
>> +        AC(AC), TheFunction(F), Hints(Hints) {}
>>
>>   /// Information about vectorization costs
>>   struct VectorizationFactor {
>> @@ -1573,6 +1572,9 @@ public:
>>   /// given vectorization factors.
>>   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
>>
>> +  /// Collect values we want to ignore in the cost model.
>> +  void collectValuesToIgnore();
>> +
>> private:
>>   /// The vectorization cost is a combination of the cost itself and a boolean
>>   /// indicating whether any of the contributing operations will actually
>> @@ -1617,8 +1619,8 @@ public:
>>
>>   /// The loop that we evaluate.
>>   Loop *TheLoop;
>> -  /// Scev analysis.
>> -  ScalarEvolution *SE;
>> +  /// Predicated scalar evolution analysis.
>> +  PredicatedScalarEvolution &PSE;
>>   /// Loop Info analysis.
>>   LoopInfo *LI;
>>   /// Vectorization legality.
>> @@ -1627,13 +1629,17 @@ public:
>>   const TargetTransformInfo &TTI;
>>   /// Target Library Info.
>>   const TargetLibraryInfo *TLI;
>> -  /// Demanded bits analysis
>> +  /// Demanded bits analysis.
>>   DemandedBits *DB;
>> +  /// Assumption cache.
>> +  AssumptionCache *AC;
>>   const Function *TheFunction;
>> -  // Loop Vectorize Hint.
>> +  /// Loop Vectorize Hint.
>>   const LoopVectorizeHints *Hints;
>> -  // Values to ignore in the cost model.
>> -  const SmallPtrSetImpl<const Value *> &ValuesToIgnore;
>> +  /// Values to ignore in the cost model.
>> +  SmallPtrSet<const Value *, 16> ValuesToIgnore;
>> +  /// Values to ignore in the cost model when VF > 1.
>> +  SmallPtrSet<const Value *, 16> VecValuesToIgnore;
>> };
>>
>> /// \brief This holds vectorization requirements that must be verified late in
>> @@ -1881,19 +1887,10 @@ struct LoopVectorize : public FunctionPa
>>       return false;
>>     }
>>
>> -    // Collect values we want to ignore in the cost model. This includes
>> -    // type-promoting instructions we identified during reduction detection.
>> -    SmallPtrSet<const Value *, 32> ValuesToIgnore;
>> -    CodeMetrics::collectEphemeralValues(L, AC, ValuesToIgnore);
>> -    for (auto &Reduction : *LVL.getReductionVars()) {
>> -      RecurrenceDescriptor &RedDes = Reduction.second;
>> -      SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
>> -      ValuesToIgnore.insert(Casts.begin(), Casts.end());
>> -    }
>> -
>>     // Use the cost model.
>> -    LoopVectorizationCostModel CM(L, PSE.getSE(), LI, &LVL, *TTI, TLI, DB, AC,
>> -                                  F, &Hints, ValuesToIgnore);
>> +    LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, F,
>> +                                  &Hints);
>> +    CM.collectValuesToIgnore();
>>
>>     // Check the function attributes to find out if this function should be
>>     // optimized for size.
>> @@ -5190,7 +5187,7 @@ LoopVectorizationCostModel::selectVector
>>   }
>>
>>   // Find the trip count.
>> -  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
>> +  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
>>   DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
>>
>>   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
>> @@ -5409,7 +5406,7 @@ unsigned LoopVectorizationCostModel::sel
>>     return 1;
>>
>>   // Do not interleave loops with a relatively small trip count.
>> -  unsigned TC = SE->getSmallConstantTripCount(TheLoop);
>> +  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
>>   if (TC > 1 && TC < TinyTripCountInterleaveThreshold)
>>     return 1;
>>
>> @@ -5639,15 +5636,15 @@ LoopVectorizationCostModel::calculateReg
>>     if (!Ends.count(I))
>>       continue;
>>
>> -    // Skip ignored values.
>> -    if (ValuesToIgnore.count(I))
>> -      continue;
>> -
>>     // Remove all of the instructions that end at this location.
>>     InstrList &List = TransposeEnds[i];
>>     for (unsigned int j = 0, e = List.size(); j < e; ++j)
>>       OpenIntervals.erase(List[j]);
>>
>> +    // Skip ignored values.
>> +    if (ValuesToIgnore.count(I))
>> +      continue;
>> +
>>     // For each VF find the maximum usage of registers.
>>     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
>>       if (VFs[j] == 1) {
>> @@ -5657,8 +5654,12 @@ LoopVectorizationCostModel::calculateReg
>>
>>       // Count the number of live intervals.
>>       unsigned RegUsage = 0;
>> -      for (auto Inst : OpenIntervals)
>> +      for (auto Inst : OpenIntervals) {
>> +        // Skip ignored values for VF > 1.
>> +        if (VecValuesToIgnore.count(Inst))
>> +          continue;
>>         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
>> +      }
>>       MaxUsages[j] = std::max(MaxUsages[j], RegUsage);
>>     }
>>
>> @@ -5830,6 +5831,7 @@ unsigned LoopVectorizationCostModel::get
>>   if (VF > 1 && MinBWs.count(I))
>>     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
>>   VectorTy = ToVectorTy(RetTy, VF);
>> +  auto SE = PSE.getSE();
>>
>>   // TODO: We need to estimate the cost of intrinsic calls.
>>   switch (I->getOpcode()) {
>> @@ -6158,6 +6160,79 @@ bool LoopVectorizationCostModel::isConse
>>   return false;
>> }
>>
>> +void LoopVectorizationCostModel::collectValuesToIgnore() {
>> +  // Ignore ephemeral values.
>> +  CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
>> +
>> +  // Ignore type-promoting instructions we identified during reduction
>> +  // detection.
>> +  for (auto &Reduction : *Legal->getReductionVars()) {
>> +    RecurrenceDescriptor &RedDes = Reduction.second;
>> +    SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
>> +    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
>> +  }
>> +
>> +  // Ignore induction phis that are only used in either GetElementPtr or ICmp
>> +  // instruction to exit loop. Induction variables usually have large types and
>> +  // can have big impact when estimating register usage.
>> +  // This is for when VF > 1.
>> +  for (auto &Induction : *Legal->getInductionVars()) {
>> +    auto *PN = Induction.first;
>> +    auto *UpdateV = PN->getIncomingValueForBlock(TheLoop->getLoopLatch());
>> +
>> +    // Check that the PHI is only used by the induction increment (UpdateV) or
>> +    // by GEPs. Then check that UpdateV is only used by a compare instruction or
>> +    // the loop header PHI.
>> +    // FIXME: Need precise def-use analysis to determine if this instruction
>> +    // variable will be vectorized.
>> +    if (std::all_of(PN->user_begin(), PN->user_end(),
>> +                    [&](const User *U) -> bool {
>> +                      return U == UpdateV || isa<GetElementPtrInst>(U);
>> +                    }) &&
>> +        std::all_of(UpdateV->user_begin(), UpdateV->user_end(),
>> +                    [&](const User *U) -> bool {
>> +                      return U == PN || isa<ICmpInst>(U);
>> +                    })) {
>> +      VecValuesToIgnore.insert(PN);
>> +      VecValuesToIgnore.insert(UpdateV);
>> +    }
>> +  }
>> +
>> +  // Ignore instructions that will not be vectorized.
>> +  // This is for when VF > 1.
>> +  for (auto bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be;
>> +       ++bb) {
>> +    for (auto &Inst : **bb) {
>> +      switch (Inst.getOpcode())
>> +      case Instruction::GetElementPtr: {
>> +        // Ignore GEP if its last operand is an induction variable so that it is
>> +        // a consecutive load/store and won't be vectorized as scatter/gather
>> +        // pattern.
>> +
>> +        GetElementPtrInst *Gep = cast<GetElementPtrInst>(&Inst);
>> +        unsigned NumOperands = Gep->getNumOperands();
>> +        unsigned InductionOperand = getGEPInductionOperand(Gep);
>> +        bool GepToIgnore = true;
>> +
>> +        // Check that all of the gep indices are uniform except for the
>> +        // induction operand.
>> +        for (unsigned i = 0; i != NumOperands; ++i) {
>> +          if (i != InductionOperand &&
>> +              !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
>> +                                            TheLoop)) {
>> +            GepToIgnore = false;
>> +            break;
>> +          }
>> +        }
>> +
>> +        if (GepToIgnore)
>> +          VecValuesToIgnore.insert(&Inst);
>> +        break;
>> +      }
>> +    }
>> +  }
>> +}
>> +
>> void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
>>                                              bool IfPredicateStore) {
>>   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
>>
>> Added: llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll?rev=270113&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll (added)
>> +++ llvm/trunk/test/Transforms/LoopVectorize/X86/reg-usage.ll Thu May 19 15:38:03 2016
>> @@ -0,0 +1,71 @@
>> +; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -O2 -S 2>&1 | FileCheck %s
>> +; REQUIRES: asserts
>> +
>> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
>> +target triple = "x86_64-unknown-linux-gnu"
>> +
>> +@a = global [1024 x i8] zeroinitializer, align 16
>> +@b = global [1024 x i8] zeroinitializer, align 16
>> +
>> +define i32 @foo() {
>> +; This function has a loop of SAD pattern. Here we check when VF = 16 the
>> +; register usage doesn't exceed 16.
>> +;
>> +; CHECK-LABEL: foo
>> +; CHECK:      LV(REG): VF = 4
>> +; CHECK-NEXT: LV(REG): Found max usage: 4
>> +; CHECK:      LV(REG): VF = 8
>> +; CHECK-NEXT: LV(REG): Found max usage: 7
>> +; CHECK:      LV(REG): VF = 16
>> +; CHECK-NEXT: LV(REG): Found max usage: 13
>> +
>> +entry:
>> +  br label %for.body
>> +
>> +for.cond.cleanup:
>> +  %add.lcssa = phi i32 [ %add, %for.body ]
>> +  ret i32 %add.lcssa
>> +
>> +for.body:
>> +  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
>> +  %s.015 = phi i32 [ 0, %entry ], [ %add, %for.body ]
>> +  %arrayidx = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %indvars.iv
>> +  %0 = load i8, i8* %arrayidx, align 1
>> +  %conv = zext i8 %0 to i32
>> +  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 %indvars.iv
>> +  %1 = load i8, i8* %arrayidx2, align 1
>> +  %conv3 = zext i8 %1 to i32
>> +  %sub = sub nsw i32 %conv, %conv3
>> +  %ispos = icmp sgt i32 %sub, -1
>> +  %neg = sub nsw i32 0, %sub
>> +  %2 = select i1 %ispos, i32 %sub, i32 %neg
>> +  %add = add nsw i32 %2, %s.015
>> +  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
>> +  %exitcond = icmp eq i64 %indvars.iv.next, 1024
>> +  br i1 %exitcond, label %for.cond.cleanup, label %for.body
>> +}
>> +
>> +define i64 @bar(i64* nocapture %a) {
>> +; CHECK-LABEL: bar
>> +; CHECK:       LV(REG): VF = 2
>> +; CHECK:       LV(REG): Found max usage: 4
>> +;
>> +entry:
>> +  br label %for.body
>> +
>> +for.cond.cleanup:
>> +  %add2.lcssa = phi i64 [ %add2, %for.body ]
>> +  ret i64 %add2.lcssa
>> +
>> +for.body:
>> +  %i.012 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
>> +  %s.011 = phi i64 [ 0, %entry ], [ %add2, %for.body ]
>> +  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %i.012
>> +  %0 = load i64, i64* %arrayidx, align 8
>> +  %add = add nsw i64 %0, %i.012
>> +  store i64 %add, i64* %arrayidx, align 8
>> +  %add2 = add nsw i64 %add, %s.011
>> +  %inc = add nuw nsw i64 %i.012, 1
>> +  %exitcond = icmp eq i64 %inc, 1024
>> +  br i1 %exitcond, label %for.cond.cleanup, label %for.body
>> +}
>>
>> Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll?rev=270113&r1=270112&r2=270113&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll (original)
>> +++ llvm/trunk/test/Transforms/LoopVectorize/X86/vector_max_bandwidth.ll Thu May 19 15:38:03 2016
>> @@ -16,7 +16,7 @@ target triple = "x86_64-unknown-linux-gn
>> ; -vectorizer-maximize-bandwidth is indicated.
>> ;
>> ; CHECK-label: foo
>> -; CHECK: LV: Selecting VF: 16.
>> +; CHECK: LV: Selecting VF: 32.
>> define void @foo() {
>> entry:
>>   br label %for.body
>>
>>
>> _______________________________________________
>> llvm-commits mailing list
>> llvm-commits at lists.llvm.org
>> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>