[llvm-commits] [llvm] r171436 - in /llvm/trunk/lib/Transforms/Vectorize: LoopVectorize.cpp LoopVectorize.h

Wed Jan 2 17:00:49 PST 2013

On Wed, Jan 2, 2013 at 4:52 PM, Nadav Rotem <nrotem at apple.com> wrote:

> Author: nadav
> Date: Wed Jan  2 18:52:27 2013
> New Revision: 171436
>
> URL: http://llvm.org/viewvc/llvm-project?rev=171436&view=rev
> Log:
> LoopVectorizer: Add support for loop-unrolling during vectorization for
> increasing the ILP. At the moment this feature is disabled by default and
> this commit should not cause any functional changes.
>

You can still add test cases by passing the -force-vector-unroll flag in the
RUN line, and it would be good to do so. In particular, the test case can have
two RUN lines and verify both strategies (with and without unrolling),
demonstrating any cut-off point or heuristic that is eventually used to
selectively enable this.

>
>
> Modified:
>     llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>     llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=171436&r1=171435&r2=171436&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Wed Jan  2
> 18:52:27 2013
> @@ -42,6 +42,11 @@
>  VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
>                      cl::desc("Sets the SIMD width. Zero is autoselect."));
>
> +static cl::opt<unsigned>
> +VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden,
> +                    cl::desc("Sets the vectorization unroll count. "
> +                             "Zero is autoselect."));
> +
>  static cl::opt<bool>
>  EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
>                     cl::desc("Enable if-conversion during
> vectorization."));
> @@ -117,7 +122,7 @@
>            F->getParent()->getModuleIdentifier()<<"\n");
>
>      // If we decided that it is *legal* to vectorizer the loop then do it.
> -    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
> +    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, VectorizationUnroll);
>      LB.vectorize(&LVL);
>
>      DEBUG(verifyFunction(*L->getHeader()->getParent()));
> @@ -180,7 +185,8 @@
>    return Shuf;
>  }
>
> -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate)
> {
> +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned
> StartIdx,
> +                                                 bool Negate) {
>    assert(Val->getType()->isVectorTy() && "Must be a vector");
>    assert(Val->getType()->getScalarType()->isIntegerTy() &&
>           "Elem must be an integer");
> @@ -191,8 +197,10 @@
>    SmallVector<Constant*, 8> Indices;
>
>    // Create a vector of consecutive numbers from zero to VF.
> -  for (int i = 0; i < VLen; ++i)
> -    Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
> +  for (int i = 0; i < VLen; ++i) {
> +    int Idx = Negate ? (-i): i;
> +    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
> +  }
>
>    // Add the consecutive indices to the vector value.
>    Constant *Cv = ConstantVector::get(Indices);
> @@ -244,18 +252,20 @@
>    return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
>  }
>
> -Value *InnerLoopVectorizer::getVectorValue(Value *V) {
> +InnerLoopVectorizer::VectorParts&
> +InnerLoopVectorizer::getVectorValue(Value *V) {
>    assert(V != Induction && "The new induction variable should not be
> used.");
>    assert(!V->getType()->isVectorTy() && "Can't widen a vector");
> -  // If we saved a vectorized copy of V, use it.
> -  Value *&MapEntry = WidenMap[V];
> -  if (MapEntry)
> -    return MapEntry;
>
> -  // Broadcast V and save the value for future uses.
> +  // If we have this scalar in the map, return it.
> +  if (WidenMap.has(V))
> +    return WidenMap.get(V);
> +
> +  // If this scalar is unknown, assume that it is a constant or that it is
> +  // loop invariant. Broadcast V and save the value for future uses.
>    Value *B = getBroadcastInstrs(V);
> -  MapEntry = B;
> -  return B;
> +  WidenMap.splat(V, B);
> +  return WidenMap.get(V);
>  }
>
>  Constant*
> @@ -277,7 +287,7 @@
>  void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
>    assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
>    // Holds vector parameters or scalars, in case of uniform vals.
> -  SmallVector<Value*, 8> Params;
> +  SmallVector<VectorParts, 4> Params;
>
>    // Find all of the vectorized parameters.
>    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
> @@ -295,12 +305,14 @@
>      // If the src is an instruction that appeared earlier in the basic
> block
>      // then it should already be vectorized.
>      if (SrcInst && OrigLoop->contains(SrcInst)) {
> -      assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
> +      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
>        // The parameter is a vector value from earlier.
> -      Params.push_back(WidenMap[SrcInst]);
> +      Params.push_back(WidenMap.get(SrcInst));
>      } else {
>        // The parameter is a scalar from outside the loop. Maybe even a
> constant.
> -      Params.push_back(SrcOp);
> +      VectorParts Scalars;
> +      Scalars.append(UF, SrcOp);
> +      Params.push_back(Scalars);
>      }
>    }
>
> @@ -309,39 +321,38 @@
>
>    // Does this instruction return a value ?
>    bool IsVoidRetTy = Instr->getType()->isVoidTy();
> -  Value *VecResults = 0;
>
> -  // If we have a return value, create an empty vector. We place the
> scalarized
> -  // instructions in this vector.
> -  if (!IsVoidRetTy)
> -    VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
> +  Value *UndefVec = IsVoidRetTy ? 0 :
> +    UndefValue::get(VectorType::get(Instr->getType(), VF));
> +  // Create a new entry in the WidenMap and initialize it to Undef or
> Null.
> +  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
>
>    // For each scalar that we create:
> -  for (unsigned i = 0; i < VF; ++i) {
> -    Instruction *Cloned = Instr->clone();
> -    if (!IsVoidRetTy)
> -      Cloned->setName(Instr->getName() + ".cloned");
> -    // Replace the operands of the cloned instrucions with extracted
> scalars.
> -    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
> -      Value *Op = Params[op];
> -      // Param is a vector. Need to extract the right lane.
> -      if (Op->getType()->isVectorTy())
> -        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
> -      Cloned->setOperand(op, Op);
> -    }
> -
> -    // Place the cloned scalar in the new loop.
> -    Builder.Insert(Cloned);
> -
> -    // If the original scalar returns a value we need to place it in a
> vector
> -    // so that future users will be able to use it.
> -    if (!IsVoidRetTy)
> -      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
> -                                               Builder.getInt32(i));
> +  for (unsigned Width = 0; Width < VF; ++Width) {
> +    // For each vector unroll 'part':
> +    for (unsigned Part = 0; Part < UF; ++Part) {
> +      Instruction *Cloned = Instr->clone();
> +      if (!IsVoidRetTy)
> +        Cloned->setName(Instr->getName() + ".cloned");
> +      // Replace the operands of the cloned instrucions with extracted
> scalars.
> +      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
> +        Value *Op = Params[op][Part];
> +        // Param is a vector. Need to extract the right lane.
> +        if (Op->getType()->isVectorTy())
> +          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
> +        Cloned->setOperand(op, Op);
> +      }
> +
> +      // Place the cloned scalar in the new loop.
> +      Builder.Insert(Cloned);
> +
> +      // If the original scalar returns a value we need to place it in a
> vector
> +      // so that future users will be able to use it.
> +      if (!IsVoidRetTy)
> +        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part],
> Cloned,
> +
> Builder.getInt32(Width));
> +    }
>    }
> -
> -  if (!IsVoidRetTy)
> -    WidenMap[Instr] = VecResults;
>  }
>
>  Value*
> @@ -503,7 +514,9 @@
>
>    // Generate the induction variable.
>    Induction = Builder.CreatePHI(IdxTy, 2, "index");
> -  Constant *Step = ConstantInt::get(IdxTy, VF);
> +  // The loop step is equal to the vectorization factor (num of SIMD
> elements)
> +  // times the unroll factor (num of SIMD instructions).
> +  Constant *Step = ConstantInt::get(IdxTy, VF * UF);
>
>    // We may need to extend the index in case there is a type mismatch.
>    // We know that the count starts at zero and does not overflow.
> @@ -521,8 +534,7 @@
>
>    // Now we need to generate the expression for N - (N % VF), which is
>    // the part that the vectorized body will execute.
> -  Constant *CIVF = ConstantInt::get(IdxTy, VF);
> -  Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
> +  Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc);
>    Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec",
> Loc);
>    Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown,
> StartIdx,
>                                                       "end.idx.rnd.down",
> Loc);
> @@ -775,7 +787,6 @@
>    for (PhiVector::iterator it = RdxPHIsToFix.begin(), e =
> RdxPHIsToFix.end();
>         it != e; ++it) {
>      PHINode *RdxPhi = *it;
> -    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
>      assert(RdxPhi && "Unable to recover vectorized PHI");
>
>      // Find the reduction variable descriptor.
> @@ -791,8 +802,8 @@
>      Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
>
>      // This is the vector-clone of the value that leaves the loop.
> -    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
> -    Type *VecTy = VectorExit->getType();
> +    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
> +    Type *VecTy = VectorExit[0]->getType();
>
>      // Find the reduction identity variable. Zero for addition, or, xor,
>      // one for multiplication, -1 for And.
> @@ -811,10 +822,17 @@
>
>      // Reductions do not have to start at zero. They can start with
>      // any loop invariant values.
> -    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
> -    Value *Val =
> -
>  getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
> -    VecRdxPhi->addIncoming(Val, LoopVectorBody);
> +    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
> +    BasicBlock *Latch = OrigLoop->getLoopLatch();
> +    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
> +    VectorParts &Val = getVectorValue(LoopVal);
> +    for (unsigned part = 0; part < UF; ++part) {
> +      // Make sure to add the reduction start value only to the
> +      // first unroll part.
> +      Value *StartVal = (part == 0) ? VectorStart : Identity;
> +      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
> +      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
> LoopVectorBody);
> +    }
>
>      // Before each round, move the insertion point right between
>      // the PHIs and the values we are going to write.
> @@ -822,18 +840,54 @@
>      // instructions.
>      Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
>
> -    // This PHINode contains the vectorized reduction variable, or
> -    // the initial value vector, if we bypass the vector loop.
> -    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
> -    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
> -    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr),
> LoopVectorBody);
> +    VectorParts RdxParts;
> +    for (unsigned part = 0; part < UF; ++part) {
> +      // This PHINode contains the vectorized reduction variable, or
> +      // the initial value vector, if we bypass the vector loop.
> +      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
> +      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
> +      Value *StartVal = (part == 0) ? VectorStart : Identity;
> +      NewPhi->addIncoming(StartVal, LoopBypassBlock);
> +      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
> +      RdxParts.push_back(NewPhi);
> +    }
> +
> +    // Reduce all of the unrolled parts into a single vector.
> +    Value *ReducedPartRdx = RdxParts[0];
> +    for (unsigned part = 1; part < UF; ++part) {
> +      switch (RdxDesc.Kind) {
> +      case LoopVectorizationLegality::IntegerAdd:
> +        ReducedPartRdx =
> +          Builder.CreateAdd(RdxParts[part], ReducedPartRdx, "add.rdx");
> +        break;
> +      case LoopVectorizationLegality::IntegerMult:
> +        ReducedPartRdx =
> +          Builder.CreateMul(RdxParts[part], ReducedPartRdx, "mul.rdx");
> +        break;
> +      case LoopVectorizationLegality::IntegerOr:
> +        ReducedPartRdx =
> +          Builder.CreateOr(RdxParts[part], ReducedPartRdx, "or.rdx");
> +        break;
> +      case LoopVectorizationLegality::IntegerAnd:
> +        ReducedPartRdx =
> +          Builder.CreateAnd(RdxParts[part], ReducedPartRdx, "and.rdx");
> +        break;
> +      case LoopVectorizationLegality::IntegerXor:
> +        ReducedPartRdx =
> +          Builder.CreateXor(RdxParts[part], ReducedPartRdx, "xor.rdx");
> +        break;
> +      default:
> +        llvm_unreachable("Unknown reduction operation");
> +      }
> +    }
> +
>
>      // VF is a power of 2 so we can emit the reduction using log2(VF)
> shuffles
>      // and vector ops, reducing the set of values being computed by half
> each
>      // round.
>      assert(isPowerOf2_32(VF) &&
>             "Reduction emission only supported for pow2 vectors!");
> -    Value *TmpVec = NewPhi;
> +    Value *TmpVec = ReducedPartRdx;
>      SmallVector<Constant*, 32> ShuffleMask(VF, 0);
>      for (unsigned i = VF; i != 1; i >>= 1) {
>        // Move the upper half of the vector to the lower half.
> @@ -922,27 +976,34 @@
>    }
>  }
>
> -Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock
> *Dst) {
> +InnerLoopVectorizer::VectorParts
> +InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
>    assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst)
> &&
>           "Invalid edge");
>
> -  Value *SrcMask = createBlockInMask(Src);
> +  VectorParts SrcMask = createBlockInMask(Src);
>
>    // The terminator has to be a branch inst!
>    BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
>    assert(BI && "Unexpected terminator found");
>
> -  Value *EdgeMask = SrcMask;
>    if (BI->isConditional()) {
> -    EdgeMask = getVectorValue(BI->getCondition());
> +    VectorParts EdgeMask = getVectorValue(BI->getCondition());
> +
>      if (BI->getSuccessor(0) != Dst)
> -      EdgeMask = Builder.CreateNot(EdgeMask);
> +      for (unsigned part = 0; part < UF; ++part)
> +        EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
> +
> +    for (unsigned part = 0; part < UF; ++part)
> +      EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
> +    return EdgeMask;
>    }
>
> -  return Builder.CreateAnd(EdgeMask, SrcMask);
> +  return SrcMask;
>  }
>
> -Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
> +InnerLoopVectorizer::VectorParts
> +InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
>    assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
>
>    // Loop incoming mask is all-one.
> @@ -953,11 +1014,14 @@
>
>    // This is the block mask. We OR all incoming edges, and with zero.
>    Value *Zero =
> ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
> -  Value *BlockMask = getVectorValue(Zero);
> +  VectorParts BlockMask = getVectorValue(Zero);
>
>    // For each pred:
> -  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
> -    BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB));
> +  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e;
> ++it) {
> +    VectorParts EM = createEdgeMask(*it, BB);
> +    for (unsigned part = 0; part < UF; ++part)
> +      BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
> +  }
>
>    return BlockMask;
>  }
> @@ -969,6 +1033,7 @@
>
>    // For each instruction in the old loop.
>    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;
> ++it) {
> +    VectorParts &Entry = WidenMap.get(it);
>      switch (it->getOpcode()) {
>      case Instruction::Br:
>        // Nothing to do for PHIs and BR, since we already took care of the
> @@ -978,11 +1043,12 @@
>        PHINode* P = cast<PHINode>(it);
>        // Handle reduction variables:
>        if (Legal->getReductionVars()->count(P)) {
> -        // This is phase one of vectorizing PHIs.
> -        Type *VecTy = VectorType::get(it->getType(), VF);
> -        WidenMap[it] =
> -          PHINode::Create(VecTy, 2, "vec.phi",
> -                          LoopVectorBody->getFirstInsertionPt());
> +        for (unsigned part = 0; part < UF; ++part) {
> +          // This is phase one of vectorizing PHIs.
> +          Type *VecTy = VectorType::get(it->getType(), VF);
> +          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
> +                                        LoopVectorBody->
> getFirstInsertionPt());
> +        }
>          PV->push_back(P);
>          continue;
>        }
> @@ -996,12 +1062,15 @@
>          // At this point we generate the predication tree. There may be
>          // duplications since this is a simple recursive scan, but future
>          // optimizations will clean it up.
> -        Value *Cond = createEdgeMask(P->getIncomingBlock(0),
> P->getParent());
> -        WidenMap[P] =
> -          Builder.CreateSelect(Cond,
> -                               getVectorValue(P->getIncomingValue(0)),
> -                               getVectorValue(P->getIncomingValue(1)),
> -                               "predphi");
> +        VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
> +                                               P->getParent());
> +
> +        for (unsigned part = 0; part < UF; ++part) {
> +        VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
> +        VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
> +          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
> In1[part],
> +                                             "predphi");
> +        }
>          continue;
>        }
>
> @@ -1021,8 +1090,8 @@
>          Value *Broadcasted = getBroadcastInstrs(Induction);
>          // After broadcasting the induction variable we need to make the
>          // vector consecutive by adding 0, 1, 2 ...
> -        Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
> -        WidenMap[OldInduction] = ConsecutiveInduction;
> +        for (unsigned part = 0; part < UF; ++part)
> +          Entry[part] = getConsecutiveVector(Broadcasted, VF * part,
> false);
>          continue;
>        }
>        case LoopVectorizationLegality::ReverseIntInduction:
> @@ -1054,9 +1123,8 @@
>            Value *Broadcasted = getBroadcastInstrs(ReverseInd);
>            // After broadcasting the induction variable we need to make the
>            // vector consecutive by adding  ... -3, -2, -1, 0.
> -          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
> -                                                             true);
> -          WidenMap[it] = ConsecutiveInduction;
> +          for (unsigned part = 0; part < UF; ++part)
> +            Entry[part] = getConsecutiveVector(Broadcasted, -VF * part,
> true);
>            continue;
>          }
>
> @@ -1065,19 +1133,21 @@
>
>          // This is the vector of results. Notice that we don't generate
>          // vector geps because scalar geps result in better code.
> -        Value *VecVal = UndefValue::get(VectorType::get(P->getType(),
> VF));
> -        for (unsigned int i = 0; i < VF; ++i) {
> -          Constant *Idx = ConstantInt::get(Induction->getType(), i);
> -          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
> -                                               "gep.idx");
> -          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
> -                                             "next.gep");
> -          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
> -                                               Builder.getInt32(i),
> -                                               "insert.gep");
> +        for (unsigned part = 0; part < UF; ++part) {
> +          Value *VecVal = UndefValue::get(VectorType::get(P->getType(),
> VF));
> +          for (unsigned int i = 0; i < VF; ++i) {
> +            Constant *Idx = ConstantInt::get(Induction->getType(),
> +                                             i + part * VF);
> +            Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
> +                                                 "gep.idx");
> +            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
> +                                               "next.gep");
> +            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
> +                                                 Builder.getInt32(i),
> +                                                 "insert.gep");
> +          }
> +          Entry[part] = VecVal;
>          }
> -
> -        WidenMap[it] = VecVal;
>          continue;
>        }
>
> @@ -1103,41 +1173,48 @@
>      case Instruction::Xor: {
>        // Just widen binops.
>        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
> -      Value *A = getVectorValue(it->getOperand(0));
> -      Value *B = getVectorValue(it->getOperand(1));
> +      VectorParts &A = getVectorValue(it->getOperand(0));
> +      VectorParts &B = getVectorValue(it->getOperand(1));
>
>        // Use this vector value for all users of the original instruction.
> -      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
> -      WidenMap[it] = V;
> +      for (unsigned Part = 0; Part < UF; ++Part) {
> +        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part],
> B[Part]);
>
> -      // Update the NSW, NUW and Exact flags.
> -      BinaryOperator *VecOp = cast<BinaryOperator>(V);
> -      if (isa<OverflowingBinaryOperator>(BinOp)) {
> -        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
> -        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
> +        // Update the NSW, NUW and Exact flags.
> +        BinaryOperator *VecOp = cast<BinaryOperator>(V);
> +        if (isa<OverflowingBinaryOperator>(BinOp)) {
> +          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
> +          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
> +        }
> +        if (isa<PossiblyExactOperator>(VecOp))
> +          VecOp->setIsExact(BinOp->isExact());
> +
> +        Entry[Part] = V;
>        }
> -      if (isa<PossiblyExactOperator>(VecOp))
> -        VecOp->setIsExact(BinOp->isExact());
>        break;
>      }
>      case Instruction::Select: {
>        // Widen selects.
>        // If the selector is loop invariant we can create a select
>        // instruction with a scalar condition. Otherwise, use
> vector-select.
> -      Value *Cond = it->getOperand(0);
> -      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond),
> OrigLoop);
> +      bool InvariantCond =
> SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
> +                                               OrigLoop);
>
>        // The condition can be loop invariant  but still defined inside the
>        // loop. This means that we can't just use the original 'cond'
> value.
>        // We have to take the 'vectorized' value and pick the first lane.
>        // Instcombine will make this a no-op.
> -      Cond = getVectorValue(Cond);
> -      if (InvariantCond)
> -        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
> -
> -      Value *Op0 = getVectorValue(it->getOperand(1));
> -      Value *Op1 = getVectorValue(it->getOperand(2));
> -      WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
> +      VectorParts &Cond = getVectorValue(it->getOperand(0));
> +      VectorParts &Op0  = getVectorValue(it->getOperand(1));
> +      VectorParts &Op1  = getVectorValue(it->getOperand(2));
> +      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
> +
> Builder.getInt32(0));
> +      for (unsigned Part = 0; Part < UF; ++Part) {
> +        Entry[Part] = Builder.CreateSelect(
> +          InvariantCond ? ScalarCond : Cond[Part],
> +          Op0[Part],
> +          Op1[Part]);
> +      }
>        break;
>      }
>
> @@ -1146,12 +1223,16 @@
>        // Widen compares. Generate vector compares.
>        bool FCmp = (it->getOpcode() == Instruction::FCmp);
>        CmpInst *Cmp = dyn_cast<CmpInst>(it);
> -      Value *A = getVectorValue(it->getOperand(0));
> -      Value *B = getVectorValue(it->getOperand(1));
> -      if (FCmp)
> -        WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
> -      else
> -        WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
> +      VectorParts &A = getVectorValue(it->getOperand(0));
> +      VectorParts &B = getVectorValue(it->getOperand(1));
> +      for (unsigned Part = 0; Part < UF; ++Part) {
> +        Value *C = 0;
> +        if (FCmp)
> +          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
> +        else
> +          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
> +        Entry[Part] = C;
> +      }
>        break;
>      }
>
> @@ -1173,12 +1254,17 @@
>          break;
>        }
>
> +      // Handle consecutive stores.
> +
>        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
>        if (Gep) {
>          // The last index does not have to be the induction. It can be
>          // consecutive and be a function of the index. For example A[I+1];
>          unsigned NumOperands = Gep->getNumOperands();
> -        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -
> 1));
> +
> +        Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
> +        VectorParts &GEPParts = getVectorValue(LastGepOperand);
> +        Value *LastIndex = GEPParts[0];
>          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
>
>          // Create the new GEP with the new induction variable.
> @@ -1188,19 +1274,28 @@
>        } else {
>          // Use the induction element ptr.
>          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
> -        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
> +        VectorParts &PtrVal = getVectorValue(Ptr);
> +        Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
>        }
>
> -      // If the address is consecutive but reversed, then the
> -      // wide load needs to start at the last vector element.
> -      if (Reverse)
> -        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
> -
> -      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
> -      Value *Val = getVectorValue(SI->getValueOperand());
> -      if (Reverse)
> -        Val = reverseVector(Val);
> -      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
> +      VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
> +      for (unsigned Part = 0; Part < UF; ++Part) {
> +        // Calculate the pointer for the specific unroll-part.
> +        Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part *
> VF));
> +
> +        if (Reverse) {
> +          // If we store to reverse consecutive memory locations then we
> need
> +          // to reverse the order of elements in the stored value.
> +          StoredVal[Part] = reverseVector(StoredVal[Part]);
> +          // If the address is consecutive but reversed, then the
> +          // wide store needs to start at the last vector element.
> +          PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
> +          PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
> +        }
> +
> +        Value *VecPtr = Builder.CreateBitCast(PartPtr,
> StTy->getPointerTo());
> +        Builder.CreateStore(StoredVal[Part],
> VecPtr)->setAlignment(Alignment);
> +      }
>        break;
>      }
>      case Instruction::Load: {
> @@ -1224,7 +1319,10 @@
>          // The last index does not have to be the induction. It can be
>          // consecutive and be a function of the index. For example A[I+1];
>          unsigned NumOperands = Gep->getNumOperands();
> -        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands
> -1));
> +
> +        Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
> +        VectorParts &GEPParts = getVectorValue(LastGepOperand);
> +        Value *LastIndex = GEPParts[0];
>          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
>
>          // Create the new GEP with the new induction variable.
> @@ -1234,19 +1332,26 @@
>        } else {
>          // Use the induction element ptr.
>          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
> -        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
> +        VectorParts &PtrVal = getVectorValue(Ptr);
> +        Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
>        }
> -      // If the address is consecutive but reversed, then the
> -      // wide load needs to start at the last vector element.
> -      if (Reverse)
> -        Ptr = Builder.CreateGEP(Ptr, Builder.getInt32(1 - VF));
> -
> -      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
> -      LI = Builder.CreateLoad(Ptr);
> -      LI->setAlignment(Alignment);
>
> -      // Use this vector value for all users of the load.
> -      WidenMap[it] = Reverse ? reverseVector(LI) :  LI;
> +      for (unsigned Part = 0; Part < UF; ++Part) {
> +        // Calculate the pointer for the specific unroll-part.
> +        Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part *
> VF));
> +
> +        if (Reverse) {
> +          // If the address is consecutive but reversed, then the
> +          // wide load needs to start at the last vector element.
> +          PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
> +          PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
> +        }
> +
> +        Value *VecPtr = Builder.CreateBitCast(PartPtr,
> RetTy->getPointerTo());
> +        Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
> +        cast<LoadInst>(LI)->setAlignment(Alignment);
> +        Entry[Part] = Reverse ? reverseVector(LI) :  LI;
> +      }
>        break;
>      }
>      case Instruction::ZExt:
> @@ -1271,13 +1376,16 @@
>          Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
>                                                 CI->getType());
>          Value *Broadcasted = getBroadcastInstrs(ScalarCast);
> -        WidenMap[it] = getConsecutiveVector(Broadcasted);
> +        for (unsigned Part = 0; Part < UF; ++Part)
> +          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part,
> false);
>          break;
>        }
>        /// Vectorize casts.
> -      Value *A = getVectorValue(it->getOperand(0));
>        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
> -      WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
> +
> +      VectorParts &A = getVectorValue(it->getOperand(0));
> +      for (unsigned Part = 0; Part < UF; ++Part)
> +        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part],
> DestTy);
>        break;
>      }
>
> @@ -1286,12 +1394,16 @@
>        Module *M = BB->getParent()->getParent();
>        IntrinsicInst *II = cast<IntrinsicInst>(it);
>        Intrinsic::ID ID = II->getIntrinsicID();
> -      SmallVector<Value*, 4> Args;
> -      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
> -        Args.push_back(getVectorValue(II->getArgOperand(i)));
> -      Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF)
> };
> -      Function *F = Intrinsic::getDeclaration(M, ID, Tys);
> -      WidenMap[it] = Builder.CreateCall(F, Args);
> +      for (unsigned Part = 0; Part < UF; ++Part) {
> +        SmallVector<Value*, 4> Args;
> +        for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) {
> +          VectorParts &Arg = getVectorValue(II->getArgOperand(i));
> +          Args.push_back(Arg[Part]);
> +        }
> +        Type *Tys[] = { VectorType::get(II->getType()->getScalarType(),
> VF) };
> +        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
> +        Entry[Part] = Builder.CreateCall(F, Args);
> +      }
>        break;
>      }
>
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h?rev=171436&r1=171435&r2=171436&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h Wed Jan  2
> 18:52:27 2013
> @@ -54,6 +54,8 @@
>  #include "llvm/Analysis/ScalarEvolution.h"
>  #include "llvm/IR/IRBuilder.h"
>  #include <algorithm>
> +#include <map>
> +
>  using namespace llvm;
>
>  /// We don't vectorize loops with a known constant trip count below this
> number.
> @@ -91,9 +93,11 @@
>  public:
>    /// Ctor.
>    InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
> -                      DominatorTree *Dt, DataLayout *Dl, unsigned
> VecWidth):
> +                      DominatorTree *Dt, DataLayout *Dl,
> +                      unsigned VecWidth, unsigned UnrollFactor):
>    OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
> -  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
> +  UF(UnrollFactor), Builder(Se->getContext()), Induction(0),
> OldInduction(0),
> +  WidenMap(UnrollFactor) { }
>
>    // Perform the actual loop widening (vectorization).
>    void vectorize(LoopVectorizationLegality *Legal) {
> @@ -109,6 +113,10 @@
>  private:
>    /// A small list of PHINodes.
>    typedef SmallVector<PHINode*, 4> PhiVector;
> +  /// When we unroll loops we have multiple vector values for each scalar.
> +  /// This data structure holds the unrolled and vectorized values that
> +  /// originated from one scalar instruction.
> +  typedef SmallVector<Value*, 2> VectorParts;
>
>    /// Add code that checks at runtime if the accessed arrays overlap.
>    /// Returns the comparator value or NULL if no check is needed.
> @@ -122,10 +130,10 @@
>    /// A helper function that computes the predicate of the block BB,
> assuming
>    /// that the header block of the loop is set to True. It returns the
> *entry*
>    /// mask for the block BB.
> -  Value *createBlockInMask(BasicBlock *BB);
> +  VectorParts createBlockInMask(BasicBlock *BB);
>    /// A helper function that computes the predicate of the edge between
> SRC
>    /// and DST.
> -  Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
> +  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
>
>    /// A helper function to vectorize a single BB within the innermost
> loop.
>    void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock
> *BB,
> @@ -148,14 +156,15 @@
>
>    /// This function adds 0, 1, 2 ... to each vector element, starting at
> zero.
>    /// If Negate is set then negative numbers are added e.g. (0, -1, -2,
> ...).
> -  Value *getConsecutiveVector(Value* Val, bool Negate = false);
> +  /// The sequence starts at StartIdx.
> +  Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
>
>    /// When we go over instructions in the basic block we rely on previous
>    /// values within the current basic block or on loop invariant values.
>    /// When we widen (vectorize) values we place them in the map. If the
> values
>    /// are not within the map, they have to be loop invariant, so we simply
>    /// broadcast them into a vector.
> -  Value *getVectorValue(Value *V);
> +  VectorParts &getVectorValue(Value *V);
>
>    /// Get a uniform vector of constant integers. We use this to get
>    /// vectors of ones and zeros for the reduction code.
> @@ -164,22 +173,61 @@
>    /// Generate a shuffle sequence that will reverse the vector Vec.
>    Value *reverseVector(Value *Vec);
>
> -  typedef DenseMap<Value*, Value*> ValueMap;
> +  /// This is a helper class that holds the vectorizer state. It maps
> scalar
> +  /// instructions to vector instructions. When the code is 'unrolled'
> then
> +  /// a single scalar value is mapped to multiple vector parts. The
> parts
> +  /// are stored in the VectorParts type.
> +  struct ValueMap {
> +    /// C'tor.  UnrollFactor controls the number of vectors ('parts') that
> +    /// are mapped.
> +    ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
> +
> +    /// \return True if 'Key' is saved in the Value Map.
> +    bool has(Value *Key) { return MapStoreage.count(Key); }
> +
> +    /// Initializes a new entry in the map. Sets all of the vector parts
> to the
> +    /// same value in 'Val'.
> +    /// \return A reference to a vector with splat values.
> +    VectorParts &splat(Value *Key, Value *Val) {
> +        MapStoreage[Key].clear();
> +        MapStoreage[Key].append(UF, Val);
> +        return MapStoreage[Key];
> +    }
> +
> +    ///\return A reference to the value that is stored at 'Key'.
> +    VectorParts &get(Value *Key) {
> +      if (!has(Key))
> +        MapStoreage[Key].resize(UF);
> +      return MapStoreage[Key];
> +    }
> +
> +    /// The unroll factor. Each entry in the map stores this number of
> vector
> +    /// elements.
> +    unsigned UF;
> +
> +    /// Map storage. We use std::map and not DenseMap because insertions
> to a
> +    /// dense map invalidate its iterators.
> +    std::map<Value*, VectorParts> MapStoreage;
> +  };
>
>    /// The original loop.
>    Loop *OrigLoop;
> -  // Scev analysis to use.
> +  /// Scev analysis to use.
>    ScalarEvolution *SE;
> -  // Loop Info.
> +  /// Loop Info.
>    LoopInfo *LI;
> -  // Dominator Tree.
> +  /// Dominator Tree.
>    DominatorTree *DT;
> -  // Data Layout.
> +  /// Data Layout.
>    DataLayout *DL;
> -  // The vectorization factor to use.
> +  /// The vectorization SIMD factor to use. Each vector will have this
> many
> +  /// vector elements.
>    unsigned VF;
> +  /// The vectorization unroll factor to use. Each scalar is vectorized
> to this
> +  /// many different vector instructions.
> +  unsigned UF;
>
> -  // The builder that we use
> +  /// The builder that we use
>    IRBuilder<> Builder;
>
>    // --- Vectorization state ---
> @@ -203,7 +251,7 @@
>    PHINode *Induction;
>    /// The induction variable of the old basic block.
>    PHINode *OldInduction;
> -  // Maps scalars to widened vectors.
> +  /// Maps scalars to widened vectors.
>    ValueMap WidenMap;
>  };
>
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130102/f456ec7f/attachment.html>