[llvm] r189281 - LoopVectorize: Implement partial loop unrolling when vectorization is not profitable.

Tue Aug 27 14:37:37 PDT 2013

So, out of curiosity: why put a partial unroller in the vectorizer? Why not
implement it as a separate pass, etc.?

This seems a little odd.

-eric

On Mon, Aug 26, 2013 at 3:33 PM, Nadav Rotem <nrotem at apple.com> wrote:
> Author: nadav
> Date: Mon Aug 26 17:33:26 2013
> New Revision: 189281
>
> URL: http://llvm.org/viewvc/llvm-project?rev=189281&view=rev
> Log:
> LoopVectorize: Implement partial loop unrolling when vectorization is not profitable.
> This patch enables unrolling of loops when vectorization is legal but not profitable.
> We add a new class InnerLoopUnroller, that extends InnerLoopVectorizer and replaces some of the vector-specific logic with scalars.
>
> This patch does not introduce any runtime regressions and improves the following workloads:
>
> SingleSource/Benchmarks/Shootout/matrix -22.64%
> SingleSource/Benchmarks/Shootout-C++/matrix -13.06%
> External/SPEC/CINT2006/464_h264ref/464_h264ref  -3.99%
> SingleSource/Benchmarks/Adobe-C++/simple_types_constant_folding -1.95%
>
>
> Added:
>     llvm/trunk/test/Transforms/LoopVectorize/unroll_novec.ll
> Modified:
>     llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=189281&r1=189280&r2=189281&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Mon Aug 26 17:33:26 2013
> @@ -126,6 +126,9 @@ static const unsigned MaxVectorWidth = 6
>  /// Maximum vectorization unroll count.
>  static const unsigned MaxUnrollFactor = 16;
>
> +/// The cost of a loop that is considered 'small' by the unroller.
> +static const unsigned SmallLoopCost = 20;
> +
>  namespace {
>
>  // Forward declarations.
> @@ -167,7 +170,9 @@ public:
>      updateAnalysis();
>    }
>
> -private:
> +  virtual ~InnerLoopVectorizer() {}
> +
> +protected:
>    /// A small list of PHINodes.
>    typedef SmallVector<PHINode*, 4> PhiVector;
>    /// When we unroll loops we have multiple vector values for each scalar.
> @@ -187,7 +192,13 @@ private:
>    /// Create an empty loop, based on the loop ranges of the old loop.
>    void createEmptyLoop(LoopVectorizationLegality *Legal);
>    /// Copy and widen the instructions from the old loop.
> -  void vectorizeLoop(LoopVectorizationLegality *Legal);
> +  virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
> +
> +  /// \brief The Loop exit block may have single value PHI nodes where the
> +  /// incoming value is 'Undef'. While vectorizing we only handled real values
> +  /// that were defined inside the loop. Here we fix the 'undef case'.
> +  /// See PR14725.
> +  void fixLCSSAPHIs();
>
>    /// A helper function that computes the predicate of the block BB, assuming
>    /// that the header block of the loop is set to True. It returns the *entry*
> @@ -201,16 +212,23 @@ private:
>    void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
>                              PhiVector *PV);
>
> +  /// Vectorize a single PHINode in a block. This method handles the induction
> +  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
> +  /// arbitrary length vectors.
> +  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
> +                           LoopVectorizationLegality *Legal,
> +                           unsigned UF, unsigned VF, PhiVector *PV);
> +
>    /// Insert the new loop to the loop hierarchy and pass manager
>    /// and update the analysis passes.
>    void updateAnalysis();
>
>    /// This instruction is un-vectorizable. Implement it as a sequence
>    /// of scalars.
> -  void scalarizeInstruction(Instruction *Instr);
> +  virtual void scalarizeInstruction(Instruction *Instr);
>
>    /// Vectorize Load and Store instructions,
> -  void vectorizeMemoryInstruction(Instruction *Instr,
> +  virtual void vectorizeMemoryInstruction(Instruction *Instr,
>                                    LoopVectorizationLegality *Legal);
>
>    /// Create a broadcast instruction. This method generates a broadcast
> @@ -218,12 +236,12 @@ private:
>    /// value. If this is the induction variable then we extend it to N, N+1, ...
>    /// this is needed because each iteration in the loop corresponds to a SIMD
>    /// element.
> -  Value *getBroadcastInstrs(Value *V);
> +  virtual Value *getBroadcastInstrs(Value *V);
>
>    /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
>    /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
>    /// The sequence starts at StartIndex.
> -  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
> +  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
>
>    /// When we go over instructions in the basic block we rely on previous
>    /// values within the current basic block or on loop invariant values.
> @@ -233,7 +251,7 @@ private:
>    VectorParts &getVectorValue(Value *V);
>
>    /// Generate a shuffle sequence that will reverse the vector Vec.
> -  Value *reverseVector(Value *Vec);
> +  virtual Value *reverseVector(Value *Vec);
>
>    /// This is a helper class that holds the vectorizer state. It maps scalar
>    /// instructions to vector instructions. When the code is 'unrolled' then
> @@ -291,6 +309,8 @@ private:
>    /// The vectorization SIMD factor to use. Each vector will have this many
>    /// vector elements.
>    unsigned VF;
> +
> +protected:
>    /// The vectorization unroll factor to use. Each scalar is vectorized to this
>    /// many different vector instructions.
>    unsigned UF;
> @@ -326,6 +346,23 @@ private:
>    EdgeMaskCache MaskCache;
>  };
>
> +class InnerLoopUnroller : public InnerLoopVectorizer {
> +public:
> +  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
> +                    DominatorTree *DT, DataLayout *DL,
> +                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
> +    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
> +
> +private:
> +  virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
> +  virtual void scalarizeInstruction(Instruction *Instr);
> +  virtual void vectorizeMemoryInstruction(Instruction *Instr,
> +                                          LoopVectorizationLegality *Legal);
> +  virtual Value *getBroadcastInstrs(Value *V);
> +  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
> +  virtual Value *reverseVector(Value *Vec);
> +};
> +
>  /// \brief Look for a meaningful debug location on the instruction or it's
>  /// operands.
>  static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
> @@ -875,7 +912,7 @@ struct LoopVectorize : public LoopPass {
>
>      LoopVectorizeHints Hints(L);
>
> -    if (Hints.Width == 1) {
> +    if (Hints.Width == 1 && Hints.Unroll == 1) {
>        DEBUG(dbgs() << "LV: Not vectorizing.\n");
>        return false;
>      }
> @@ -914,16 +951,23 @@ struct LoopVectorize : public LoopPass {
>
>      if (VF.Width == 1) {
>        DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
> -      return false;
>      }
>
>      DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
>            F->getParent()->getModuleIdentifier()<<"\n");
>      DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
>
> -    // If we decided that it is *legal* to vectorize the loop then do it.
> -    InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
> -    LB.vectorize(&LVL);
> +    if (VF.Width == 1) {
> +      if (UF == 1)
> +        return false;
> +      // We decided not to vectorize, but we may want to unroll.
> +      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
> +      Unroller.vectorize(&LVL);
> +    } else {
> +      // If we decided that it is *legal* to vectorize the loop then do it.
> +      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
> +      LB.vectorize(&LVL);
> +    }
>
>      // Mark the loop as already vectorized to avoid vectorizing again.
>      Hints.setAlreadyVectorized(L);
> @@ -2136,11 +2180,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopV
>      (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
>      (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
>    }// end of for each redux variable.
> +
> +  fixLCSSAPHIs();
> +}
>
> -  // The Loop exit block may have single value PHI nodes where the incoming
> -  // value is 'undef'. While vectorizing we only handled real values that
> -  // were defined inside the loop. Here we handle the 'undef case'.
> -  // See PR14725.
> +void InnerLoopVectorizer::fixLCSSAPHIs() {
>    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
>         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
>      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
> @@ -2149,7 +2193,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopV
>        LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
>                              LoopMiddleBlock);
>    }
> -}
> +}
>
>  InnerLoopVectorizer::VectorParts
>  InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
> @@ -2210,161 +2254,185 @@ InnerLoopVectorizer::createBlockInMask(B
>    return BlockMask;
>  }
>
> -void
> -InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
> -                                          BasicBlock *BB, PhiVector *PV) {
> -  // For each instruction in the old loop.
> -  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
> -    VectorParts &Entry = WidenMap.get(it);
> -    switch (it->getOpcode()) {
> -    case Instruction::Br:
> -      // Nothing to do for PHIs and BR, since we already took care of the
> -      // loop control flow instructions.
> -      continue;
> -    case Instruction::PHI:{
> -      PHINode* P = cast<PHINode>(it);
> -      // Handle reduction variables:
> -      if (Legal->getReductionVars()->count(P)) {
> -        for (unsigned part = 0; part < UF; ++part) {
> -          // This is phase one of vectorizing PHIs.
> -          Type *VecTy = VectorType::get(it->getType(), VF);
> -          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
> -                                        LoopVectorBody-> getFirstInsertionPt());
> -        }
> -        PV->push_back(P);
> -        continue;
> -      }
> +void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
> +                                              InnerLoopVectorizer::VectorParts &Entry,
> +                                              LoopVectorizationLegality *Legal,
> +                                              unsigned UF, unsigned VF, PhiVector *PV) {
> +  PHINode* P = cast<PHINode>(PN);
> +  // Handle reduction variables:
> +  if (Legal->getReductionVars()->count(P)) {
> +    for (unsigned part = 0; part < UF; ++part) {
> +      // This is phase one of vectorizing PHIs.
> +      Type *VecTy = (VF == 1) ? PN->getType() :
> +      VectorType::get(PN->getType(), VF);
> +      Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
> +                                    LoopVectorBody-> getFirstInsertionPt());
> +    }
> +    PV->push_back(P);
> +    return;
> +  }
>
> -      setDebugLocFromInst(Builder, P);
> -      // Check for PHI nodes that are lowered to vector selects.
> -      if (P->getParent() != OrigLoop->getHeader()) {
> -        // We know that all PHIs in non header blocks are converted into
> -        // selects, so we don't have to worry about the insertion order and we
> -        // can just use the builder.
> -        // At this point we generate the predication tree. There may be
> -        // duplications since this is a simple recursive scan, but future
> -        // optimizations will clean it up.
> -
> -        unsigned NumIncoming = P->getNumIncomingValues();
> -
> -        // Generate a sequence of selects of the form:
> -        // SELECT(Mask3, In3,
> -        //      SELECT(Mask2, In2,
> -        //                   ( ...)))
> -        for (unsigned In = 0; In < NumIncoming; In++) {
> -          VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
> -                                            P->getParent());
> -          VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
> -
> -          for (unsigned part = 0; part < UF; ++part) {
> -            // We might have single edge PHIs (blocks) - use an identity
> -            // 'select' for the first PHI operand.
> -            if (In == 0)
> -              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
> -                                                 In0[part]);
> -            else
> -              // Select between the current value and the previous incoming edge
> -              // based on the incoming mask.
> -              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
> -                                                 Entry[part], "predphi");
> -          }
> -        }
> -        continue;
> +  setDebugLocFromInst(Builder, P);
> +  // Check for PHI nodes that are lowered to vector selects.
> +  if (P->getParent() != OrigLoop->getHeader()) {
> +    // We know that all PHIs in non header blocks are converted into
> +    // selects, so we don't have to worry about the insertion order and we
> +    // can just use the builder.
> +    // At this point we generate the predication tree. There may be
> +    // duplications since this is a simple recursive scan, but future
> +    // optimizations will clean it up.
> +
> +    unsigned NumIncoming = P->getNumIncomingValues();
> +
> +    // Generate a sequence of selects of the form:
> +    // SELECT(Mask3, In3,
> +    //      SELECT(Mask2, In2,
> +    //                   ( ...)))
> +    for (unsigned In = 0; In < NumIncoming; In++) {
> +      VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
> +                                        P->getParent());
> +      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
> +
> +      for (unsigned part = 0; part < UF; ++part) {
> +        // We might have single edge PHIs (blocks) - use an identity
> +        // 'select' for the first PHI operand.
> +        if (In == 0)
> +          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
> +                                             In0[part]);
> +        else
> +          // Select between the current value and the previous incoming edge
> +          // based on the incoming mask.
> +          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
> +                                             Entry[part], "predphi");
>        }
> +    }
> +    return;
> +  }
>
> -      // This PHINode must be an induction variable.
> -      // Make sure that we know about it.
> -      assert(Legal->getInductionVars()->count(P) &&
> -             "Not an induction variable");
> -
> -      LoopVectorizationLegality::InductionInfo II =
> -        Legal->getInductionVars()->lookup(P);
> -
> -      switch (II.IK) {
> -      case LoopVectorizationLegality::IK_NoInduction:
> -        llvm_unreachable("Unknown induction");
> -      case LoopVectorizationLegality::IK_IntInduction: {
> -        assert(P->getType() == II.StartValue->getType() && "Types must match");
> -        Type *PhiTy = P->getType();
> -        Value *Broadcasted;
> -        if (P == OldInduction) {
> -          // Handle the canonical induction variable. We might have had to
> -          // extend the type.
> -          Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
> -        } else {
> -          // Handle other induction variables that are now based on the
> -          // canonical one.
> -          Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
> -                                                   "normalized.idx");
> -          NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
> -          Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
> -                                          "offset.idx");
> -        }
> -        Broadcasted = getBroadcastInstrs(Broadcasted);
> -        // After broadcasting the induction variable we need to make the vector
> -        // consecutive by adding 0, 1, 2, etc.
> +  // This PHINode must be an induction variable.
> +  // Make sure that we know about it.
> +  assert(Legal->getInductionVars()->count(P) &&
> +         "Not an induction variable");
> +
> +  LoopVectorizationLegality::InductionInfo II =
> +  Legal->getInductionVars()->lookup(P);
> +
> +  switch (II.IK) {
> +    case LoopVectorizationLegality::IK_NoInduction:
> +      llvm_unreachable("Unknown induction");
> +    case LoopVectorizationLegality::IK_IntInduction: {
> +      assert(P->getType() == II.StartValue->getType() && "Types must match");
> +      Type *PhiTy = P->getType();
> +      Value *Broadcasted;
> +      if (P == OldInduction) {
> +        // Handle the canonical induction variable. We might have had to
> +        // extend the type.
> +        Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
> +      } else {
> +        // Handle other induction variables that are now based on the
> +        // canonical one.
> +        Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
> +                                                 "normalized.idx");
> +        NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
> +        Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
> +                                        "offset.idx");
> +      }
> +      Broadcasted = getBroadcastInstrs(Broadcasted);
> +      // After broadcasting the induction variable we need to make the vector
> +      // consecutive by adding 0, 1, 2, etc.
> +      for (unsigned part = 0; part < UF; ++part)
> +        Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
> +      return;
> +    }
> +    case LoopVectorizationLegality::IK_ReverseIntInduction:
> +    case LoopVectorizationLegality::IK_PtrInduction:
> +    case LoopVectorizationLegality::IK_ReversePtrInduction:
> +      // Handle reverse integer and pointer inductions.
> +      Value *StartIdx = ExtendedIdx;
> +      // This is the normalized GEP that starts counting at zero.
> +      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
> +                                               "normalized.idx");
> +
> +      // Handle the reverse integer induction variable case.
> +      if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
> +        IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
> +        Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
> +                                               "resize.norm.idx");
> +        Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
> +                                               "reverse.idx");
> +
> +        // This is a new value so do not hoist it out.
> +        Value *Broadcasted = getBroadcastInstrs(ReverseInd);
> +        // After broadcasting the induction variable we need to make the
> +        // vector consecutive by adding  ... -3, -2, -1, 0.
>          for (unsigned part = 0; part < UF; ++part)
> -          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
> -        continue;
> +          Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
> +                                             true);
> +        return;
>        }
> -      case LoopVectorizationLegality::IK_ReverseIntInduction:
> -      case LoopVectorizationLegality::IK_PtrInduction:
> -      case LoopVectorizationLegality::IK_ReversePtrInduction:
> -        // Handle reverse integer and pointer inductions.
> -        Value *StartIdx = ExtendedIdx;
> -        // This is the normalized GEP that starts counting at zero.
> -        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
> -                                                 "normalized.idx");
>
> -        // Handle the reverse integer induction variable case.
> -        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
> -          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
> -          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
> -                                                 "resize.norm.idx");
> -          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
> -                                                 "reverse.idx");
> -
> -          // This is a new value so do not hoist it out.
> -          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
> -          // After broadcasting the induction variable we need to make the
> -          // vector consecutive by adding  ... -3, -2, -1, 0.
> -          for (unsigned part = 0; part < UF; ++part)
> -            Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
> -                                               true);
> +      // Handle the pointer induction variable case.
> +      assert(P->getType()->isPointerTy() && "Unexpected type.");
> +
> +      // Is this a reverse induction ptr or a consecutive induction ptr.
> +      bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
> +                      II.IK);
> +
> +      // This is the vector of results. Notice that we don't generate
> +      // vector geps because scalar geps result in better code.
> +      for (unsigned part = 0; part < UF; ++part) {
> +        if (VF == 1) {
> +          int EltIndex = (part) * (Reverse ? -1 : 1);
> +          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
> +          Value *GlobalIdx;
> +          if (Reverse)
> +            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
> +          else
> +            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
> +
> +          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
> +                                             "next.gep");
> +          Entry[part] = SclrGep;
>            continue;
>          }
>
> -        // Handle the pointer induction variable case.
> -        assert(P->getType()->isPointerTy() && "Unexpected type.");
> +        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
> +        for (unsigned int i = 0; i < VF; ++i) {
> +          int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
> +          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
> +          Value *GlobalIdx;
> +          if (!Reverse)
> +            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
> +          else
> +            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
>
> -        // Is this a reverse induction ptr or a consecutive induction ptr.
> -        bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
> -                        II.IK);
> -
> -        // This is the vector of results. Notice that we don't generate
> -        // vector geps because scalar geps result in better code.
> -        for (unsigned part = 0; part < UF; ++part) {
> -          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
> -          for (unsigned int i = 0; i < VF; ++i) {
> -            int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
> -            Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
> -            Value *GlobalIdx;
> -            if (!Reverse)
> -              GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
> -            else
> -              GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
> -
> -            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
> -                                               "next.gep");
> -            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
> -                                                 Builder.getInt32(i),
> -                                                 "insert.gep");
> -          }
> -          Entry[part] = VecVal;
> +          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
> +                                             "next.gep");
> +          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
> +                                               Builder.getInt32(i),
> +                                               "insert.gep");
>          }
> -        continue;
> +        Entry[part] = VecVal;
>        }
> +      return;
> +  }
> +}
>
> +void
> +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
> +                                          BasicBlock *BB, PhiVector *PV) {
> +  // For each instruction in the old loop.
> +  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
> +    VectorParts &Entry = WidenMap.get(it);
> +    switch (it->getOpcode()) {
> +    case Instruction::Br:
> +      // Nothing to do for PHIs and BR, since we already took care of the
> +      // loop control flow instructions.
> +      continue;
> +    case Instruction::PHI:{
> +      // Vectorize PHINodes.
> +      widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
> +      continue;
>      }// End of PHI.
>
>      case Instruction::Add:
> @@ -2423,8 +2491,10 @@ InnerLoopVectorizer::vectorizeBlockInLoo
>        VectorParts &Cond = getVectorValue(it->getOperand(0));
>        VectorParts &Op0  = getVectorValue(it->getOperand(1));
>        VectorParts &Op1  = getVectorValue(it->getOperand(2));
> -      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
> -                                                       Builder.getInt32(0));
> +
> +      Value *ScalarCond = (VF == 1) ? Cond[0] :
> +        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
> +
>        for (unsigned Part = 0; Part < UF; ++Part) {
>          Entry[Part] = Builder.CreateSelect(
>            InvariantCond ? ScalarCond : Cond[Part],
> @@ -2485,7 +2555,8 @@ InnerLoopVectorizer::vectorizeBlockInLoo
>          break;
>        }
>        /// Vectorize casts.
> -      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
> +      Type *DestTy = (VF == 1) ? CI->getType() :
> +                                 VectorType::get(CI->getType(), VF);
>
>        VectorParts &A = getVectorValue(it->getOperand(0));
>        for (unsigned Part = 0; Part < UF; ++Part)
> @@ -2515,7 +2586,10 @@ InnerLoopVectorizer::vectorizeBlockInLoo
>              VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
>              Args.push_back(Arg[Part]);
>            }
> -          Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
> +          Type *Tys[] = {CI->getType()};
> +          if (VF > 1)
> +            Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
> +
>            Function *F = Intrinsic::getDeclaration(M, ID, Tys);
>            Entry[Part] = Builder.CreateCall(F, Args);
>          }
> @@ -4267,7 +4341,19 @@ LoopVectorizationCostModel::selectUnroll
>    else if (UF < 1)
>      UF = 1;
>
> -  if (Legal->getReductionVars()->size()) {
> +  bool HasReductions = Legal->getReductionVars()->size();
> +
> +  // Decide if we want to unroll if we decided that it is legal to vectorize
> +  // but not profitable.
> +  if (VF == 1) {
> +    if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
> +        LoopCost > SmallLoopCost)
> +      return 1;
> +
> +    return UF;
> +  }
> +
> +  if (HasReductions) {
>      DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
>      return UF;
>    }
> @@ -4277,9 +4363,9 @@ LoopVectorizationCostModel::selectUnroll
>    // to estimate the cost of the loop and unroll until the cost of the
>    // loop overhead is about 5% of the cost of the loop.
>    DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
> -  if (LoopCost < 20) {
> +  if (LoopCost < SmallLoopCost) {
>      DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
> -    unsigned NewUF = 20/LoopCost + 1;
> +    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
>      return std::min(NewUF, UF);
>    }
>
> @@ -4701,3 +4787,245 @@ bool LoopVectorizationCostModel::isConse
>
>    return false;
>  }
> +
> +void
> +InnerLoopUnroller::vectorizeLoop(LoopVectorizationLegality *Legal) {
> +  // In order to support reduction variables we need to be able to unroll
> +  // Phi nodes. Phi nodes have cycles, so we need to unroll them in two
> +  // stages. See InnerLoopVectorizer::vectorizeLoop for more details.
> +  PhiVector RdxPHIsToFix;
> +
> +  // Scan the loop in a topological order to ensure that defs are vectorized
> +  // before users.
> +  LoopBlocksDFS DFS(OrigLoop);
> +  DFS.perform(LI);
> +
> +  // Unroll all of the blocks in the original loop.
> +  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), be = DFS.endRPO();
> +       bb != be; ++bb)
> +    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
> +
> +  // Create the 'reduced' values for each of the induction vars.
> +  // The reduced values are the vector values that we scalarize and combine
> +  // after the loop is finished.
> +  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
> +       it != e; ++it) {
> +    PHINode *RdxPhi = *it;
> +    assert(RdxPhi && "Unable to recover vectorized PHI");
> +
> +    // Find the reduction variable descriptor.
> +    assert(Legal->getReductionVars()->count(RdxPhi) &&
> +           "Unable to find the reduction variable");
> +    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
> +    (*Legal->getReductionVars())[RdxPhi];
> +
> +    setDebugLocFromInst(Builder, RdxDesc.StartValue);
> +
> +    // We need to generate a reduction vector from the incoming scalar.
> +    // To do so, we need to generate the 'identity' vector and override
> +    // one of the elements with the incoming scalar reduction. We need
> +    // to do it in the vector-loop preheader.
> +    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
> +
> +    // This is the vector-clone of the value that leaves the loop.
> +    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
> +    Type *VecTy = VectorExit[0]->getType();
> +
> +    // Find the reduction identity variable. Zero for addition, or, xor,
> +    // one for multiplication, -1 for And.
> +    Value *Identity;
> +    Value *VectorStart;
> +    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
> +        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
> +      // MinMax reductions have the start value as their identity.
> +      VectorStart = Identity = RdxDesc.StartValue;
> +
> +    } else {
> +      Identity = LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
> +                                                        VecTy->getScalarType());
> +
> +      // This vector is the Identity vector where the first element is the
> +      // incoming scalar reduction.
> +      VectorStart = RdxDesc.StartValue;
> +    }
> +
> +    // Fix the vector-loop phi.
> +    // We created the induction variable so we know that the
> +    // preheader is the first entry.
> +    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
> +
> +    // Reductions do not have to start at zero. They can start with
> +    // any loop invariant values.
> +    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
> +    BasicBlock *Latch = OrigLoop->getLoopLatch();
> +    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
> +    VectorParts &Val = getVectorValue(LoopVal);
> +    for (unsigned part = 0; part < UF; ++part) {
> +      // Make sure to add the reduction start value only to the
> +      // first unroll part.
> +      Value *StartVal = (part == 0) ? VectorStart : Identity;
> +      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
> +      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
> +    }
> +
> +    // Before each round, move the insertion point right between
> +    // the PHIs and the values we are going to write.
> +    // This allows us to write both PHINodes and the extractelement
> +    // instructions.
> +    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
> +
> +    VectorParts RdxParts;
> +    setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
> +    for (unsigned part = 0; part < UF; ++part) {
> +      // This PHINode contains the vectorized reduction variable, or
> +      // the initial value vector, if we bypass the vector loop.
> +      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
> +      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
> +      Value *StartVal = (part == 0) ? VectorStart : Identity;
> +      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
> +        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
> +      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
> +      RdxParts.push_back(NewPhi);
> +    }
> +
> +    // Reduce all of the unrolled parts into a single vector.
> +    Value *ReducedPartRdx = RdxParts[0];
> +    unsigned Op = getReductionBinOp(RdxDesc.Kind);
> +    setDebugLocFromInst(Builder, ReducedPartRdx);
> +    for (unsigned part = 1; part < UF; ++part) {
> +      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
> +        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
> +                                             RdxParts[part], ReducedPartRdx,
> +                                             "bin.rdx");
> +      else
> +        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
> +                                        ReducedPartRdx, RdxParts[part]);
> +    }
> +
> +    // Now, we need to fix the users of the reduction variable
> +    // inside and outside of the scalar remainder loop.
> +    // We know that the loop is in LCSSA form. We need to update the
> +    // PHI nodes in the exit blocks.
> +    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
> +         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
> +      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
> +      if (!LCSSAPhi) continue;
> +
> +      // All PHINodes need to have a single entry edge, or two if
> +      // we already fixed them.
> +      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
> +
> +      // We found our reduction value exit-PHI. Update it with the
> +      // incoming bypass edge.
> +      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
> +        // Add an edge coming from the bypass.
> +        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
> +        break;
> +      }
> +    }// end of the LCSSA phi scan.
> +
> +    // Fix the scalar loop reduction variable with the incoming reduction sum
> +    // from the vector body and from the backedge value.
> +    int IncomingEdgeBlockIdx =
> +    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
> +    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
> +    // Pick the other block.
> +    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
> +    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
> +    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
> +  }// end of for each redux variable.
> +
> +  fixLCSSAPHIs();
> +}
> +
> +/// Emits UF copies of \p Instr through Builder, one per unroll part, and
> +/// records the copies in WidenMap so later users can look them up.
> +///
> +/// Operands are resolved per part: the old induction variable goes through
> +/// getVectorValue(), instructions inside OrigLoop are read from WidenMap, and
> +/// every other value is reused unchanged for all parts.
> +void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
> +  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
> +
> +  setDebugLocFromInst(Builder, Instr);
> +
> +  // One entry per operand; each entry holds the value to use for each part.
> +  const unsigned NumOperands = Instr->getNumOperands();
> +  SmallVector<VectorParts, 4> OperandParts;
> +  for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
> +    Value *Operand = Instr->getOperand(Idx);
> +
> +    // References to the old induction variable are mapped to the new one.
> +    if (Operand == OldInduction) {
> +      OperandParts.push_back(getVectorValue(Operand));
> +      continue;
> +    }
> +
> +    // Anything that is not an instruction of the original loop (constants,
> +    // arguments, values defined outside the loop) is the same for every part.
> +    Instruction *Def = dyn_cast<Instruction>(Operand);
> +    if (!Def || !OrigLoop->contains(Def)) {
> +      VectorParts Invariant;
> +      Invariant.append(UF, Operand);
> +      OperandParts.push_back(Invariant);
> +      continue;
> +    }
> +
> +    // An in-loop definition must already have been processed, so its
> +    // per-part values are expected to be in the map.
> +    assert(WidenMap.has(Def) && "Source operand is unavailable");
> +    OperandParts.push_back(WidenMap.get(Def));
> +  }
> +
> +  assert(OperandParts.size() == Instr->getNumOperands() &&
> +         "Invalid number of operands");
> +
> +  // Void instructions (e.g. stores) get a null placeholder in the map;
> +  // everything else starts out as undef until the clone is created.
> +  const bool ProducesValue = !Instr->getType()->isVoidTy();
> +  Value *Placeholder = 0;
> +  if (ProducesValue)
> +    Placeholder = UndefValue::get(Instr->getType());
> +  VectorParts &Results = WidenMap.splat(Instr, Placeholder);
> +
> +  // Create one clone per unroll part.
> +  for (unsigned Part = 0; Part < UF; ++Part) {
> +    Instruction *Copy = Instr->clone();
> +    if (ProducesValue)
> +      Copy->setName(Instr->getName() + ".cloned");
> +
> +    // Rewire the clone to the operands that belong to this part.
> +    for (unsigned Idx = 0; Idx != NumOperands; ++Idx)
> +      Copy->setOperand(Idx, OperandParts[Idx][Part]);
> +
> +    // Place the clone in the new loop.
> +    Builder.Insert(Copy);
> +
> +    // Publish the result so that later users of Instr pick up the clone.
> +    if (ProducesValue)
> +      Results[Part] = Copy;
> +  }
> +}
> +
> +/// Handles a memory instruction for the unroller by delegating to
> +/// scalarizeInstruction(); the legality argument is not used.
> +void InnerLoopUnroller::vectorizeMemoryInstruction(
> +    Instruction *Instr, LoopVectorizationLegality * /*unused*/) {
> +  scalarizeInstruction(Instr);
> +}
> +
> +/// InnerLoopUnroller version of reverseVector: returns \p Vec unchanged.
> +/// NOTE(review): presumably a no-op because the unroller works on scalars,
> +/// which have no lanes to reverse -- confirm against InnerLoopVectorizer.
> +Value *InnerLoopUnroller::reverseVector(Value *Vec) {
> +  return Vec;
> +}
> +
> +/// InnerLoopUnroller version of getBroadcastInstrs: returns \p V unchanged
> +/// and emits no instructions.
> +/// NOTE(review): presumably a scalar needs no splat when the unroller is
> +/// used -- confirm against InnerLoopVectorizer.
> +Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
> +  return V;
> +}
> +
> +/// Scalar counterpart of getConsecutiveVector: returns \p Val offset by
> +/// \p StartIdx, or by -StartIdx when \p Negate is set.
> +///
> +/// \param Val      Scalar integer value to offset; must not be a vector.
> +/// \param StartIdx Offset to apply.
> +/// \param Negate   When true the offset is subtracted instead of added.
> +/// \returns the value produced by Builder.CreateAdd, named "induction".
> +Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
> +                                               bool Negate) {
> +  // When unrolling and the VF is 1, we only need to add a simple scalar.
> +  Type *ITy = Val->getType();
> +  assert(!ITy->isVectorTy() && "Val must be a scalar");
> +
> +  // The third argument of ConstantInt::get is 'isSigned', not a request to
> +  // negate, so the sign has to be applied to the offset explicitly. Widen
> +  // before negating so that the most negative int cannot overflow.
> +  long long Offset = StartIdx;
> +  if (Negate)
> +    Offset = -Offset;
> +
> +  // Build the constant as signed so that a negative offset is sign-extended
> +  // to the width of ITy rather than zero-extended.
> +  Constant *C = ConstantInt::get(ITy, Offset, /*isSigned=*/true);
> +  return Builder.CreateAdd(Val, C, "induction");
> +}
> +
>
> Added: llvm/trunk/test/Transforms/LoopVectorize/unroll_novec.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/unroll_novec.ll?rev=189281&view=auto
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/unroll_novec.ll (added)
> +++ llvm/trunk/test/Transforms/LoopVectorize/unroll_novec.ll Mon Aug 26 17:33:26 2013
> @@ -0,0 +1,39 @@
> +; RUN: opt < %s  -loop-vectorize -force-vector-width=1 -force-vector-unroll=2 -dce -instcombine -S | FileCheck %s
> +
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-apple-macosx10.8.0"
> +
> +@a = common global [2048 x i32] zeroinitializer, align 16
> +
> +; This is the loop.
> +;  for (i=0; i<n; i++){
> +;    a[i] += i;
> +;  }
> +;CHECK-LABEL: @inc(
> +;CHECK: load i32*
> +;CHECK: load i32*
> +;CHECK: add nsw i32
> +;CHECK: add nsw i32
> +;CHECK: store i32
> +;CHECK: store i32
> +;CHECK: ret void
> +; Test input: for i in [0, %n), adds the truncated loop index to a[i].
> +define void @inc(i32 %n) nounwind uwtable noinline ssp {
> +  ; Skip the loop entirely unless %n > 0.
> +  %1 = icmp sgt i32 %n, 0
> +  br i1 %1, label %.lr.ph, label %._crit_edge
> +
> +.lr.ph:                                           ; preds = %0, %.lr.ph
> +  ; 64-bit induction variable, starting at 0 on entry from %0.
> +  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
> +  ; a[i] = a[i] + (i32)i
> +  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
> +  %3 = load i32* %2, align 4
> +  %4 = trunc i64 %indvars.iv to i32
> +  %5 = add nsw i32 %3, %4
> +  store i32 %5, i32* %2, align 4
> +  ; Advance the index and leave once its low 32 bits equal %n.
> +  %indvars.iv.next = add i64 %indvars.iv, 1
> +  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> +  %exitcond = icmp eq i32 %lftr.wideiv, %n
> +  br i1 %exitcond, label %._crit_edge, label %.lr.ph
> +
> +._crit_edge:                                      ; preds = %.lr.ph, %0
> +  ret void
> +}
> +
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits