[llvm] r184647 - SLP Vectorizer: Implement multi-block slp-vectorization.

Sun Jun 23 08:43:42 PDT 2013

Hi Nadav!

This change breaks the ASan bootstrap bot with the following error report:

=================================================================
==27050==ERROR: AddressSanitizer: heap-use-after-free on address
0x60d00000c488 at pc 0x1592bfc bp 0x7ffffec9cd90 sp 0x7ffffec9cd88
READ of size 8 at 0x60d00000c488 thread T0
    #0 0x1592bfb in getParent /build/llvm/include/llvm/IR/Instruction.h:53
    #1 0x1592bfb in SetInsertPoint
/build/llvm/include/llvm/IR/IRBuilder.h:90
    #2 0x1592bfb in ~BuilderLocGuard
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:61
    #3 0x1592bfb in (anonymous
namespace)::FuncSLP::vectorizeTree_rec(llvm::ArrayRef<llvm::Value*>)
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1111
    #4 0x158e563 in (anonymous
namespace)::FuncSLP::vectorizeTree(llvm::ArrayRef<llvm::Value*>)
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1115
    #5 0x1588ba1 in vectorizeStoreChain
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:866
    #6 0x1588ba1 in vectorizeStores
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:915
    #7 0x1588ba1 in vectorizeStoreChains
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1468
    #8 0x1588ba1 in (anonymous
namespace)::SLPVectorizer::runOnFunction(llvm::Function&)
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1230
    #9 0x2b03fdc in llvm::FPPassManager::runOnFunction(llvm::Function&)
/build/llvm/lib/IR/PassManager.cpp:1530
    #10 0x2b045a5 in llvm::FPPassManager::runOnModule(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1550
    #11 0x2b04dbb in llvm::MPPassManager::runOnModule(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1608
    #12 0x2b05fb3 in llvm::PassManagerImpl::run(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1703
    #13 0x2b0642f in llvm::PassManager::run(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1738
    #14 0x6199a3 in main /build/llvm/tools/opt/opt.cpp:823
    #15 0x7fb44e87276c (/lib/x86_64-linux-gnu/libc.so.6+0x2176c)
    #16 0x608ed4 in _start (/build/llvm_build_asan/bin/opt+0x608ed4)
0x60d00000c488 is located 120 bytes inside of 136-byte region
[0x60d00000c410,0x60d00000c498)
freed by thread T0 here:
    #0 0x5f49c5 in operator delete(void*)
/build/llvm/projects/compiler-rt/lib/asan/asan_new_delete.cc:83
    #1 0x1591a6e in (anonymous
namespace)::FuncSLP::vectorizeTree_rec(llvm::ArrayRef<llvm::Value*>)
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1105
    #2 0x158e563 in (anonymous
namespace)::FuncSLP::vectorizeTree(llvm::ArrayRef<llvm::Value*>)
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1115
    #3 0x1588ba1 in vectorizeStoreChain
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:866
    #4 0x1588ba1 in vectorizeStores
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:915
    #5 0x1588ba1 in vectorizeStoreChains
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1468
    #6 0x1588ba1 in (anonymous
namespace)::SLPVectorizer::runOnFunction(llvm::Function&)
/build/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp:1230
    #7 0x2b03fdc in llvm::FPPassManager::runOnFunction(llvm::Function&)
/build/llvm/lib/IR/PassManager.cpp:1530
    #8 0x2b045a5 in llvm::FPPassManager::runOnModule(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1550
    #9 0x2b04dbb in llvm::MPPassManager::runOnModule(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1608
    #10 0x2b05fb3 in llvm::PassManagerImpl::run(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1703
    #11 0x2b0642f in llvm::PassManager::run(llvm::Module&)
/build/llvm/lib/IR/PassManager.cpp:1738
    #12 0x6199a3 in main /build/llvm/tools/opt/opt.cpp:823
    #13 0x7fb44e87276c (/lib/x86_64-linux-gnu/libc.so.6+0x2176c)
previously allocated by thread T0 here:
    #0 0x5f4705 in operator new(unsigned long)
/build/llvm/projects/compiler-rt/lib/asan/asan_new_delete.cc:52
    #1 0x2b30953 in llvm::User::operator new(unsigned long, unsigned int)
/build/llvm/lib/IR/User.cpp:60
    #2 0x149304f in operator new
/build/llvm/include/llvm/IR/Instructions.h:265
    #3 0x149304f in llvm::LLParser::ParseStore(llvm::Instruction*&,
llvm::LLParser::PerFunctionState&)
/build/llvm/lib/AsmParser/LLParser.cpp:4137
    #4 0x14811e6 in llvm::LLParser::ParseInstruction(llvm::Instruction*&,
llvm::BasicBlock*, llvm::LLParser::PerFunctionState&)
/build/llvm/lib/AsmParser/LLParser.cpp:3312
    #5 0x1480175 in
llvm::LLParser::ParseBasicBlock(llvm::LLParser::PerFunctionState&)
/build/llvm/lib/AsmParser/LLParser.cpp:3185
    #6 0x145858f in llvm::LLParser::ParseFunctionBody(llvm::Function&)
/build/llvm/lib/AsmParser/LLParser.cpp:3138
    #7 0x1445942 in ParseDefine /build/llvm/lib/AsmParser/LLParser.cpp:424
    #8 0x1445942 in llvm::LLParser::ParseTopLevelEntities()
/build/llvm/lib/AsmParser/LLParser.cpp:226
    #9 0x14455ce in llvm::LLParser::Run()
/build/llvm/lib/AsmParser/LLParser.cpp:41
    #10 0x143706e in llvm::ParseAssembly(llvm::MemoryBuffer*,
llvm::Module*, llvm::SMDiagnostic&, llvm::LLVMContext&)
/build/llvm/lib/AsmParser/Parser.cpp:38
    #11 0x11bf597 in llvm::ParseIR(llvm::MemoryBuffer*,
llvm::SMDiagnostic&, llvm::LLVMContext&)
/build/llvm/lib/IRReader/IRReader.cpp:76
    #12 0x11bff04 in llvm::ParseIRFile(std::string const&,
llvm::SMDiagnostic&, llvm::LLVMContext&)
/build/llvm/lib/IRReader/IRReader.cpp:88
    #13 0x61308c in main /build/llvm/tools/opt/opt.cpp:592
    #14 0x7fb44e87276c (/lib/x86_64-linux-gnu/libc.so.6+0x2176c)
SUMMARY: AddressSanitizer: heap-use-after-free
/build/llvm/include/llvm/IR/Instruction.h:53 getParent

Can you please fix this?

On Sun, Jun 23, 2013 at 1:34 AM, Nadav Rotem <nrotem at apple.com> wrote:

> Author: nadav
> Date: Sat Jun 22 16:34:10 2013
> New Revision: 184647
>
> URL: http://llvm.org/viewvc/llvm-project?rev=184647&view=rev
> Log:
> SLP Vectorizer: Implement multi-block slp-vectorization.
>
> Rewrote the SLP-vectorization as a whole-function vectorization pass. It
> is now able to vectorize chains across multiple basic blocks.
> It still does not vectorize PHIs, but this should be easy to do now that
> we scan the entire function.
> I removed the support for extracting values from trees.
> We are now able to vectorize more programs, but there are some serious
> regressions in many workloads (such as flops-6 and mandel-2).
>
>
> Added:
>     llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll
> Removed:
>     llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp
>     llvm/trunk/lib/Transforms/Vectorize/VecUtils.h
> Modified:
>     llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt
>     llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>     llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll
>     llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt?rev=184647&r1=184646&r2=184647&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt Sat Jun 22 16:34:10
> 2013
> @@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize
>    Vectorize.cpp
>    LoopVectorize.cpp
>    SLPVectorizer.cpp
> -  VecUtils.cpp
>    )
>
>  add_dependencies(LLVMVectorize intrinsics_gen)
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=184647&r1=184646&r2=184647&view=diff
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Sat Jun 22
> 16:34:10 2013
> @@ -18,17 +18,20 @@
>  #define SV_NAME "slp-vectorizer"
>  #define DEBUG_TYPE "SLP"
>
> -#include "VecUtils.h"
>  #include "llvm/Transforms/Vectorize.h"
>  #include "llvm/ADT/MapVector.h"
> +#include "llvm/ADT/SetVector.h"
>  #include "llvm/Analysis/AliasAnalysis.h"
>  #include "llvm/Analysis/ScalarEvolution.h"
> +#include "llvm/Analysis/ScalarEvolutionExpressions.h"
> +#include "llvm/Analysis/AliasAnalysis.h"
>  #include "llvm/Analysis/TargetTransformInfo.h"
>  #include "llvm/Analysis/Verifier.h"
>  #include "llvm/Analysis/LoopInfo.h"
>  #include "llvm/IR/DataLayout.h"
>  #include "llvm/IR/Instructions.h"
>  #include "llvm/IR/IntrinsicInst.h"
> +#include "llvm/IR/IRBuilder.h"
>  #include "llvm/IR/Module.h"
>  #include "llvm/IR/Type.h"
>  #include "llvm/IR/Value.h"
> @@ -36,6 +39,7 @@
>  #include "llvm/Support/CommandLine.h"
>  #include "llvm/Support/Debug.h"
>  #include "llvm/Support/raw_ostream.h"
> +#include <algorithm>
>  #include <map>
>
>  using namespace llvm;
> @@ -46,9 +50,1138 @@ static cl::opt<int>
>                                "number. (gain = -cost of vectorization)"));
>  namespace {
>
> +static const unsigned MinVecRegSize = 128;
> +
> +static const unsigned RecursionMaxDepth = 6;
> +
> +/// RAII pattern to save the insertion point of the IR builder.
> +class BuilderLocGuard {
> +public:
> +  BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {}
> +  ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); }
> +
> +private:
> +  // Prevent copying.
> +  BuilderLocGuard(const BuilderLocGuard &);
> +  BuilderLocGuard &operator=(const BuilderLocGuard &);
> +  IRBuilder<> &Builder;
> +  BasicBlock::iterator Loc;
> +};
> +
> +/// A helper class for numbering instructions in multible blocks.
> +/// Numbers starts at zero for each basic block.
> +struct BlockNumbering {
> +
> +  BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
> +
> +  BlockNumbering() : BB(0), Valid(false) {}
> +
> +  void numberInstructions() {
> +    unsigned Loc = 0;
> +    InstrIdx.clear();
> +    InstrVec.clear();
> +    // Number the instructions in the block.
> +    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;
> ++it) {
> +      InstrIdx[it] = Loc++;
> +      InstrVec.push_back(it);
> +      assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
> +    }
> +    Valid = true;
> +  }
> +
> +  int getIndex(Instruction *I) {
> +    if (!Valid)
> +      numberInstructions();
> +    assert(InstrIdx.count(I) && "Unknown instruction");
> +    return InstrIdx[I];
> +  }
> +
> +  Instruction *getInstruction(unsigned loc) {
> +    if (!Valid)
> +      numberInstructions();
> +    assert(InstrVec.size() > loc && "Invalid Index");
> +    return InstrVec[loc];
> +  }
> +
> +  void forget() { Valid = false; }
> +
> +private:
> +  /// The block we are numbering.
> +  BasicBlock *BB;
> +  /// Is the block numbered.
> +  bool Valid;
> +  /// Maps instructions to numbers and back.
> +  SmallDenseMap<Instruction *, int> InstrIdx;
> +  /// Maps integers to Instructions.
> +  std::vector<Instruction *> InstrVec;
> +};
> +
> +class FuncSLP {
> +  typedef SmallVector<Value *, 8> ValueList;
> +  typedef SmallVector<Instruction *, 16> InstrList;
> +  typedef SmallPtrSet<Value *, 16> ValueSet;
> +  typedef SmallVector<StoreInst *, 8> StoreList;
> +
> +public:
> +  static const int MAX_COST = INT_MIN;
> +
> +  FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
> +          TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li)
> +      : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li),
> +        Builder(Se->getContext()) {
> +    for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it)
> {
> +      BasicBlock *BB = it;
> +      BlocksNumbers[BB] = BlockNumbering(BB);
> +    }
> +  }
> +
> +  /// \brief Take the pointer operand from the Load/Store instruction.
> +  /// \returns NULL if this is not a valid Load/Store instruction.
> +  static Value *getPointerOperand(Value *I);
> +
> +  /// \brief Take the address space operand from the Load/Store
> instruction.
> +  /// \returns -1 if this is not a valid Load/Store instruction.
> +  static unsigned getAddressSpaceOperand(Value *I);
> +
> +  /// \returns true if the memory operations A and B are consecutive.
> +  bool isConsecutiveAccess(Value *A, Value *B);
> +
> +  /// \brief Vectorize the tree that starts with the elements in \p VL.
> +  /// \returns the vectorized value.
> +  Value *vectorizeTree(ArrayRef<Value *> VL);
> +
> +  /// \returns the vectorization cost of the subtree that starts at \p VL.
> +  /// A negative number means that this is profitable.
> +  int getTreeCost(ArrayRef<Value *> VL);
> +
> +  /// \returns the scalarization cost for this list of values. Assuming
> that
> +  /// this subtree gets vectorized, we may need to extract the values
> from the
> +  /// roots. This method calculates the cost of extracting the values.
> +  int getGatherCost(ArrayRef<Value *> VL);
> +
> +  /// \brief Attempts to order and vectorize a sequence of stores. This
> +  /// function does a quadratic scan of the given stores.
> +  /// \returns true if the basic block was modified.
> +  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
> +
> +  /// \brief Vectorize a group of scalars into a vector tree.
> +  /// \returns the vectorized value.
> +  Value *vectorizeArith(ArrayRef<Value *> Operands);
> +
> +  /// \brief This method contains the recursive part of getTreeCost.
> +  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
> +
> +  /// \brief This recursive method looks for vectorization hazards such as
> +  /// values that are used by multiple users and checks that values are
> used
> +  /// by only one vector lane. It updates the variables LaneMap,
> MultiUserVals.
> +  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
> +
> +  /// \brief This method contains the recursive part of vectorizeTree.
> +  Value *vectorizeTree_rec(ArrayRef<Value *> VL);
> +
> +  ///  \brief Vectorize a sorted sequence of stores.
> +  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
> +
> +  /// \returns the scalarization cost for this type. Scalarization in this
> +  /// context means the creation of vectors from a group of scalars.
> +  int getGatherCost(Type *Ty);
> +
> +  /// \returns the AA location that is being access by the instruction.
> +  AliasAnalysis::Location getLocation(Instruction *I);
> +
> +  /// \brief Checks if it is possible to sink an instruction from
> +  /// \p Src to \p Dst.
> +  /// \returns the pointer to the barrier instruction if we can't sink.
> +  Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
> +
> +  /// \returns the index of the last instrucion in the BB from \p VL.
> +  int getLastIndex(ArrayRef<Value *> VL);
> +
> +  /// \returns the Instrucion in the bundle \p VL.
> +  Instruction *getLastInstruction(ArrayRef<Value *> VL);
> +
> +  /// \returns the Instruction at index \p Index which is in Block \p BB.
> +  Instruction *getInstructionForIndex(unsigned Index, BasicBlock *BB);
> +
> +  /// \returns the index of the first User of \p VL.
> +  int getFirstUserIndex(ArrayRef<Value *> VL);
> +
> +  /// \returns a vector from a collection of scalars in \p VL.
> +  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
> +
> +  /// \brief Try to hoist gather sequences outside of the loop in cases
> where
> +  /// all of the sources are loop invariant.
> +  void hoistGatherSequence();
> +
> +  bool needToGatherAny(ArrayRef<Value *> VL) {
> +    for (int i = 0, e = VL.size(); i < e; ++i)
> +      if (MustGather.count(VL[i]))
> +        return true;
> +    return false;
> +  }
> +
> +  /// -- Vectorization State --
> +
> +  /// Maps values in the tree to the vector lanes that uses them. This
> map must
> +  /// be reset between runs of getCost.
> +  std::map<Value *, int> LaneMap;
> +  /// A list of instructions to ignore while sinking
> +  /// memory instructions. This map must be reset between runs of getCost.
> +  ValueSet MemBarrierIgnoreList;
> +
> +  /// Maps between the first scalar to the vector. This map must be reset
> +  /// between runs.
> +  DenseMap<Value *, Value *> VectorizedValues;
> +
> +  /// Contains values that must be gathered because they are used
> +  /// by multiple lanes, or by users outside the tree.
> +  /// NOTICE: The vectorization methods also use this set.
> +  ValueSet MustGather;
> +
> +  /// Contains a list of values that are used outside the current tree.
> This
> +  /// set must be reset between runs.
> +  SetVector<Value *> MultiUserVals;
> +
> +  /// Holds all of the instructions that we gathered.
> +  SetVector<Instruction *> GatherSeq;
> +
> +  /// Numbers instructions in different blocks.
> +  std::map<BasicBlock *, BlockNumbering> BlocksNumbers;
> +
> +  // Analysis and block reference.
> +  Function *F;
> +  ScalarEvolution *SE;
> +  DataLayout *DL;
> +  TargetTransformInfo *TTI;
> +  AliasAnalysis *AA;
> +  LoopInfo *LI;
> +  /// Instruction builder to construct the vectorized tree.
> +  IRBuilder<> Builder;
> +};
> +
> +int FuncSLP::getGatherCost(Type *Ty) {
> +  int Cost = 0;
> +  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e;
> ++i)
> +    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
> +  return Cost;
> +}
> +
> +int FuncSLP::getGatherCost(ArrayRef<Value *> VL) {
> +  // Find the type of the operands in VL.
> +  Type *ScalarTy = VL[0]->getType();
> +  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> +    ScalarTy = SI->getValueOperand()->getType();
> +  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
> +  // Find the cost of inserting/extracting values from the vector.
> +  return getGatherCost(VecTy);
> +}
> +
> +AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) {
> +  if (StoreInst *SI = dyn_cast<StoreInst>(I))
> +    return AA->getLocation(SI);
> +  if (LoadInst *LI = dyn_cast<LoadInst>(I))
> +    return AA->getLocation(LI);
> +  return AliasAnalysis::Location();
> +}
> +
> +Value *FuncSLP::getPointerOperand(Value *I) {
> +  if (LoadInst *LI = dyn_cast<LoadInst>(I))
> +    return LI->getPointerOperand();
> +  if (StoreInst *SI = dyn_cast<StoreInst>(I))
> +    return SI->getPointerOperand();
> +  return 0;
> +}
> +
> +unsigned FuncSLP::getAddressSpaceOperand(Value *I) {
> +  if (LoadInst *L = dyn_cast<LoadInst>(I))
> +    return L->getPointerAddressSpace();
> +  if (StoreInst *S = dyn_cast<StoreInst>(I))
> +    return S->getPointerAddressSpace();
> +  return -1;
> +}
> +
> +bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) {
> +  Value *PtrA = getPointerOperand(A);
> +  Value *PtrB = getPointerOperand(B);
> +  unsigned ASA = getAddressSpaceOperand(A);
> +  unsigned ASB = getAddressSpaceOperand(B);
> +
> +  // Check that the address spaces match and that the pointers are valid.
> +  if (!PtrA || !PtrB || (ASA != ASB))
> +    return false;
> +
> +  // Check that A and B are of the same type.
> +  if (PtrA->getType() != PtrB->getType())
> +    return false;
> +
> +  // Calculate the distance.
> +  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
> +  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
> +  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
> +  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
> +
> +  // Non constant distance.
> +  if (!ConstOffSCEV)
> +    return false;
> +
> +  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
> +  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
> +  // The Instructions are connsecutive if the size of the first
> load/store is
> +  // the same as the offset.
> +  int64_t Sz = DL->getTypeStoreSize(Ty);
> +  return ((-Offset) == Sz);
> +}
> +
> +Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
> +  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
> +  BasicBlock::iterator I = Src, E = Dst;
> +  /// Scan all of the instruction from SRC to DST and check if
> +  /// the source may alias.
> +  for (++I; I != E; ++I) {
> +    // Ignore store instructions that are marked as 'ignore'.
> +    if (MemBarrierIgnoreList.count(I))
> +      continue;
> +    if (Src->mayWriteToMemory()) /* Write */ {
> +      if (!I->mayReadOrWriteMemory())
> +        continue;
> +    } else /* Read */ {
> +      if (!I->mayWriteToMemory())
> +        continue;
> +    }
> +    AliasAnalysis::Location A = getLocation(&*I);
> +    AliasAnalysis::Location B = getLocation(Src);
> +
> +    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
> +      return I;
> +  }
> +  return 0;
> +}
> +
> +static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
> +  BasicBlock *BB = 0;
> +  for (int i = 0, e = VL.size(); i < e; i++) {
> +    Instruction *I = dyn_cast<Instruction>(VL[i]);
> +    if (!I)
> +      return 0;
> +
> +    if (!BB) {
> +      BB = I->getParent();
> +      continue;
> +    }
> +
> +    if (BB != I->getParent())
> +      return 0;
> +  }
> +  return BB;
> +}
> +
> +static bool allConstant(ArrayRef<Value *> VL) {
> +  for (unsigned i = 0, e = VL.size(); i < e; ++i)
> +    if (!isa<Constant>(VL[i]))
> +      return false;
> +  return true;
> +}
> +
> +static bool isSplat(ArrayRef<Value *> VL) {
> +  for (unsigned i = 1, e = VL.size(); i < e; ++i)
> +    if (VL[i] != VL[0])
> +      return false;
> +  return true;
> +}
> +
> +static unsigned getSameOpcode(ArrayRef<Value *> VL) {
> +  unsigned Opcode = 0;
> +  for (int i = 0, e = VL.size(); i < e; i++) {
> +    if (Instruction *I = dyn_cast<Instruction>(VL[i])) {
> +      if (!Opcode) {
> +        Opcode = I->getOpcode();
> +        continue;
> +      }
> +      if (Opcode != I->getOpcode())
> +        return 0;
> +    }
> +  }
> +  return Opcode;
> +}
> +
> +static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
> +                            VectorType *VecTy) {
> +  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid
> opcode");
> +  // Check if all of the extracts come from the same vector and from the
> +  // correct offset.
> +  Value *VL0 = VL[0];
> +  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
> +  Value *Vec = E0->getOperand(0);
> +
> +  // We have to extract from the same vector type.
> +  if (Vec->getType() != VecTy)
> +    return false;
> +
> +  // Check that all of the indices extract from the correct offset.
> +  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
> +  if (!CI || CI->getZExtValue())
> +    return false;
> +
> +  for (unsigned i = 1, e = VF; i < e; ++i) {
> +    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
> +    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
> +
> +    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
> +      return false;
> +  }
> +
> +  return true;
> +}
> +
> +void FuncSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
> +  if (Depth == RecursionMaxDepth)
> +    return MustGather.insert(VL.begin(), VL.end());
> +
> +  // Don't handle vectors.
> +  if (VL[0]->getType()->isVectorTy())
> +    return;
> +
> +  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> +    if (SI->getValueOperand()->getType()->isVectorTy())
> +      return;
> +
> +  // If all of the operands are identical or constant we have a simple
> solution.
> +  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL))
> +    return MustGather.insert(VL.begin(), VL.end());
> +
> +  // Stop the scan at unknown IR.
> +  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
> +  assert(VL0 && "Invalid instruction");
> +
> +  // Mark instructions with multiple users.
> +  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> +    Instruction *I = dyn_cast<Instruction>(VL[i]);
> +    // Remember to check if all of the users of this instruction are
> vectorized
> +    // within our tree. At depth zero we have no local users, only
> external
> +    // users that we don't care about.
> +    if (Depth && I && I->getNumUses() > 1) {
> +      DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
> +                      "because it has multiple users:" << *I << " \n");
> +      MultiUserVals.insert(I);
> +    }
> +  }
> +
> +  // Check that the instruction is only used within one lane.
> +  for (int i = 0, e = VL.size(); i < e; ++i) {
> +    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) {
> +      DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] <<
> "\n");
> +      return MustGather.insert(VL.begin(), VL.end());
> +    }
> +    // Make this instruction as 'seen' and remember the lane.
> +    LaneMap[VL[i]] = i;
> +  }
> +
> +  unsigned Opcode = getSameOpcode(VL);
> +  if (!Opcode)
> +    return MustGather.insert(VL.begin(), VL.end());
> +
> +  switch (Opcode) {
> +  case Instruction::ExtractElement: {
> +    VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
> +    // No need to follow ExtractElements that are going to be optimized
> away.
> +    if (CanReuseExtract(VL, VL.size(), VecTy))
> +      return;
> +    // Fall through.
> +  }
> +  case Instruction::Load:
> +    return;
> +  case Instruction::ZExt:
> +  case Instruction::SExt:
> +  case Instruction::FPToUI:
> +  case Instruction::FPToSI:
> +  case Instruction::FPExt:
> +  case Instruction::PtrToInt:
> +  case Instruction::IntToPtr:
> +  case Instruction::SIToFP:
> +  case Instruction::UIToFP:
> +  case Instruction::Trunc:
> +  case Instruction::FPTrunc:
> +  case Instruction::BitCast:
> +  case Instruction::Select:
> +  case Instruction::ICmp:
> +  case Instruction::FCmp:
> +  case Instruction::Add:
> +  case Instruction::FAdd:
> +  case Instruction::Sub:
> +  case Instruction::FSub:
> +  case Instruction::Mul:
> +  case Instruction::FMul:
> +  case Instruction::UDiv:
> +  case Instruction::SDiv:
> +  case Instruction::FDiv:
> +  case Instruction::URem:
> +  case Instruction::SRem:
> +  case Instruction::FRem:
> +  case Instruction::Shl:
> +  case Instruction::LShr:
> +  case Instruction::AShr:
> +  case Instruction::And:
> +  case Instruction::Or:
> +  case Instruction::Xor: {
> +    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> +      ValueList Operands;
> +      // Prepare the operand vector.
> +      for (unsigned j = 0; j < VL.size(); ++j)
> +        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
> +
> +      getTreeUses_rec(Operands, Depth + 1);
> +    }
> +    return;
> +  }
> +  case Instruction::Store: {
> +    ValueList Operands;
> +    for (unsigned j = 0; j < VL.size(); ++j)
> +      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
> +    getTreeUses_rec(Operands, Depth + 1);
> +    return;
> +  }
> +  default:
> +    return MustGather.insert(VL.begin(), VL.end());
> +  }
> +}
> +
> +int FuncSLP::getLastIndex(ArrayRef<Value *> VL) {
> +  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
> +  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid
> block");
> +  BlockNumbering &BN = BlocksNumbers[BB];
> +
> +  int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
> +  for (unsigned i = 0, e = VL.size(); i < e; ++i)
> +    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
> +  return MaxIdx;
> +}
> +
> +Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) {
> +  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
> +  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid
> block");
> +  BlockNumbering &BN = BlocksNumbers[BB];
> +
> +  int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
> +  for (unsigned i = 1, e = VL.size(); i < e; ++i)
> +    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
> +  return BN.getInstruction(MaxIdx);
> +}
> +
> +Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock
> *BB) {
> +  BlockNumbering &BN = BlocksNumbers[BB];
> +  return BN.getInstruction(Index);
> +}
> +
> +int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
> +  BasicBlock *BB = getSameBlock(VL);
> +  BlockNumbering &BN = BlocksNumbers[BB];
> +
> +  // Find the first user of the values.
> +  int FirstUser = BN.getIndex(BB->getTerminator());
> +  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> +    for (Value::use_iterator U = VL[i]->use_begin(), UE =
> VL[i]->use_end();
> +         U != UE; ++U) {
> +      Instruction *Instr = dyn_cast<Instruction>(*U);
> +
> +      if (!Instr || Instr->getParent() != BB)
> +        continue;
> +
> +      FirstUser = std::min(FirstUser, BN.getIndex(Instr));
> +    }
> +  }
> +  return FirstUser;
> +}
> +
> +int FuncSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
> +  Type *ScalarTy = VL[0]->getType();
> +
> +  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> +    ScalarTy = SI->getValueOperand()->getType();
> +
> +  /// Don't mess with vectors.
> +  if (ScalarTy->isVectorTy())
> +    return FuncSLP::MAX_COST;
> +
> +  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
> +
> +  if (allConstant(VL))
> +    return 0;
> +
> +  if (isSplat(VL))
> +    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy,
> 0);
> +
> +  if (Depth == RecursionMaxDepth || needToGatherAny(VL))
> +    return getGatherCost(VecTy);
> +
> +  BasicBlock *BB = getSameBlock(VL);
> +  unsigned Opcode = getSameOpcode(VL);
> +  assert(Opcode && BB && "Invalid Instruction Value");
> +
> +  // Check if it is safe to sink the loads or the stores.
> +  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
> +    int MaxIdx = getLastIndex(VL);
> +    Instruction *Last = getInstructionForIndex(MaxIdx, BB);
> +
> +    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> +      if (VL[i] == Last)
> +        continue;
> +      Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
> +      if (Barrier) {
> +        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
> *Last
> +                     << "\n because of " << *Barrier << "\n");
> +        return MAX_COST;
> +      }
> +    }
> +  }
> +
> +  Instruction *VL0 = cast<Instruction>(VL[0]);
> +  switch (Opcode) {
> +  case Instruction::ExtractElement: {
> +    if (CanReuseExtract(VL, VL.size(), VecTy))
> +      return 0;
> +    return getGatherCost(VecTy);
> +  }
> +  case Instruction::ZExt:
> +  case Instruction::SExt:
> +  case Instruction::FPToUI:
> +  case Instruction::FPToSI:
> +  case Instruction::FPExt:
> +  case Instruction::PtrToInt:
> +  case Instruction::IntToPtr:
> +  case Instruction::SIToFP:
> +  case Instruction::UIToFP:
> +  case Instruction::Trunc:
> +  case Instruction::FPTrunc:
> +  case Instruction::BitCast: {
> +    ValueList Operands;
> +    Type *SrcTy = VL0->getOperand(0)->getType();
> +    // Prepare the operand vector.
> +    for (unsigned j = 0; j < VL.size(); ++j) {
> +      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
> +      // Check that the casted type is the same for all users.
> +      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
> +        return getGatherCost(VecTy);
> +    }
> +
> +    int Cost = getTreeCost_rec(Operands, Depth + 1);
> +    if (Cost == FuncSLP::MAX_COST)
> +      return Cost;
> +
> +    // Calculate the cost of this instruction.
> +    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
> +                                                       VL0->getType(),
> SrcTy);
> +
> +    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
> +    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy,
> SrcVecTy);
> +    Cost += (VecCost - ScalarCost);
> +    return Cost;
> +  }
> +  case Instruction::FCmp:
> +  case Instruction::ICmp: {
> +    // Check that all of the compares have the same predicate.
> +    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
> +    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
> +      CmpInst *Cmp = cast<CmpInst>(VL[i]);
> +      if (Cmp->getPredicate() != P0)
> +        return getGatherCost(VecTy);
> +    }
> +    // Fall through.
> +  }
> +  case Instruction::Select:
> +  case Instruction::Add:
> +  case Instruction::FAdd:
> +  case Instruction::Sub:
> +  case Instruction::FSub:
> +  case Instruction::Mul:
> +  case Instruction::FMul:
> +  case Instruction::UDiv:
> +  case Instruction::SDiv:
> +  case Instruction::FDiv:
> +  case Instruction::URem:
> +  case Instruction::SRem:
> +  case Instruction::FRem:
> +  case Instruction::Shl:
> +  case Instruction::LShr:
> +  case Instruction::AShr:
> +  case Instruction::And:
> +  case Instruction::Or:
> +  case Instruction::Xor: {
> +    int TotalCost = 0;
> +    // Calculate the cost of all of the operands.
> +    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> +      ValueList Operands;
> +      // Prepare the operand vector.
> +      for (unsigned j = 0; j < VL.size(); ++j)
> +        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
> +
> +      int Cost = getTreeCost_rec(Operands, Depth + 1);
> +      if (Cost == MAX_COST)
> +        return MAX_COST;
> +      TotalCost += TotalCost;
> +    }
> +
> +    // Calculate the cost of this instruction.
> +    int ScalarCost = 0;
> +    int VecCost = 0;
> +    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
> +        Opcode == Instruction::Select) {
> +      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(),
> VL.size());
> +      ScalarCost =
> +          VecTy->getNumElements() *
> +          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
> +      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
> +    } else {
> +      ScalarCost = VecTy->getNumElements() *
> +                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
> +      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
> +    }
> +    TotalCost += (VecCost - ScalarCost);
> +    return TotalCost;
> +  }
> +  case Instruction::Load: {
> +    // If we are scalarize the loads, add the cost of forming the vector.
> +    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
> +      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
> +        return getGatherCost(VecTy);
> +
> +    // Cost of wide load - cost of scalar loads.
> +    int ScalarLdCost = VecTy->getNumElements() *
> +                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
> 1, 0);
> +    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1,
> 0);
> +    return VecLdCost - ScalarLdCost;
> +  }
> +  case Instruction::Store: {
> +    // We know that we can merge the stores. Calculate the cost.
> +    int ScalarStCost = VecTy->getNumElements() *
> +                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
> 1, 0);
> +    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,
> 0);
> +    int StoreCost = VecStCost - ScalarStCost;
> +
> +    ValueList Operands;
> +    for (unsigned j = 0; j < VL.size(); ++j) {
> +      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
> +      MemBarrierIgnoreList.insert(VL[j]);
> +    }
> +
> +    int Cost = getTreeCost_rec(Operands, Depth + 1);
> +    if (Cost == MAX_COST)
> +      return MAX_COST;
> +
> +    int TotalCost = StoreCost + Cost;
> +    return TotalCost;
> +  }
> +  default:
> +    // Unable to vectorize unknown instructions.
> +    return getGatherCost(VecTy);
> +  }
> +}
> +
> +/// Compute the cost of vectorizing the tree whose roots are \p VL.
> +///
> +/// Clears the per-tree bookkeeping sets, bails out with MAX_COST when
> +/// getSameBlock(VL) fails or when the last root comes after the first user,
> +/// records lane usage via getTreeUses_rec, and adds every multi-user value
> +/// that has a user missing from LaneMap to MustGather.
> +/// \returns MAX_COST if the tree is rejected, otherwise the value computed by
> +/// getTreeCost_rec for the roots.
> +int FuncSLP::getTreeCost(ArrayRef<Value *> VL) {
> +  // Get rid of the list of stores that were removed, and from the
> +  // lists of instructions with multiple users.
> +  MemBarrierIgnoreList.clear();
> +  LaneMap.clear();
> +  MultiUserVals.clear();
> +  MustGather.clear();
> +
> +  if (!getSameBlock(VL))
> +    return MAX_COST;
> +
> +  // Find the location of the last root.
> +  int LastRootIndex = getLastIndex(VL);
> +  int FirstUserIndex = getFirstUserIndex(VL);
> +
> +  // Don't vectorize if there are users of the tree roots inside the tree
> +  // itself.
> +  if (LastRootIndex > FirstUserIndex)
> +    return MAX_COST;
> +
> +  // Scan the tree and find which value is used by which lane, and which
> +  // values must be scalarized.
> +  getTreeUses_rec(VL, 0);
> +
> +  // Check that instructions with multiple users can be vectorized. Mark
> +  // unsafe instructions.
> +  for (SetVector<Value *>::iterator it = MultiUserVals.begin(),
> +                                    e = MultiUserVals.end();
> +       it != e; ++it) {
> +    // Check that all of the users of this instr are within the tree.
> +    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
> +         I != E; ++I) {
> +      if (LaneMap.find(*I) == LaneMap.end()) {
> +        DEBUG(dbgs() << "SLP: Adding to MustExtract "
> +                        "because of an out of tree usage.\n");
> +        MustGather.insert(*it);
> +        // One out-of-tree user is enough to force a gather; stop scanning
> +        // the remaining users of this value.
> +        break;
> +      }
> +    }
> +  }
> +
> +  // Now calculate the cost of vectorizing the tree.
> +  return getTreeCost_rec(VL, 0);
> +}
> +/// Try to vectorize a chain of stores.
> +///
> +/// Every window of VF stores is costed with getTreeCost and vectorized when
> +/// the cost is below \p CostThreshold, where VF is MinVecRegSize divided by
> +/// the stored type's size in bits. If nothing was vectorized and the chain
> +/// is not longer than VF, the whole chain is tried as a single tree.
> +/// \param Chain the candidate stores; Chain[0] must be a StoreInst.
> +/// \param CostThreshold a tree is vectorized only if its cost is below this.
> +/// \returns true if at least one tree was vectorized.
> +bool FuncSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
> +  unsigned ChainLen = Chain.size();
> +  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
> +               << "\n");
> +  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
> +  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
> +
> +  // Validate the size before dividing by it: isPowerOf2_32(0) is false, so a
> +  // zero-sized stored type is rejected here instead of dividing by zero.
> +  if (!isPowerOf2_32(Sz))
> +    return false;
> +
> +  unsigned VF = MinVecRegSize / Sz;
> +  if (VF < 2)
> +    return false;
> +
> +  bool Changed = false;
> +  // Look for profitable vectorizable trees at all offsets, starting at zero.
> +  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
> +    if (i + VF > e)
> +      break;
> +    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
> +                 << "\n");
> +    ArrayRef<Value *> Operands = Chain.slice(i, VF);
> +
> +    int Cost = getTreeCost(Operands);
> +    if (Cost == FuncSLP::MAX_COST)
> +      continue;
> +    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
> +    if (Cost < CostThreshold) {
> +      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
> +      vectorizeTree(Operands);
> +      // Skip the stores that were just vectorized; the loop increment
> +      // accounts for the remaining one.
> +      i += VF - 1;
> +      Changed = true;
> +    }
> +  }
> +
> +  if (Changed || ChainLen > VF)
> +    return Changed;
> +
> +  // Handle short chains. This helps us catch types such as <3 x float> that
> +  // are smaller than vector size.
> +  int Cost = getTreeCost(Chain);
> +  if (Cost == FuncSLP::MAX_COST)
> +    return false;
> +  if (Cost < CostThreshold) {
> +    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
> +                 << " for size = " << ChainLen << "\n");
> +    vectorizeTree(Chain);
> +    return true;
> +  }
> +
> +  return false;
> +}
> +
> +/// Group \p Stores into chains of consecutive accesses and try to vectorize
> +/// each chain with vectorizeStoreChain.
> +/// \param Stores the candidate store instructions.
> +/// \param costThreshold forwarded unchanged to vectorizeStoreChain.
> +/// \returns true if any chain was vectorized.
> +bool FuncSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
> +  SetVector<Value *> Heads, Tails;
> +  SmallDenseMap<Value *, Value *> ConsecutiveChain;
> +
> +  // Several chains may merge into one. Stores that were already vectorized
> +  // are remembered here so that no store is visited twice.
> +  ValueSet VectorizedStores;
> +
> +  // Quadratic search over all ordered pairs of distinct stores: record every
> +  // pair in which the second store directly follows the first.
> +  const unsigned NumStores = Stores.size();
> +  for (unsigned First = 0; First < NumStores; ++First) {
> +    for (unsigned Second = 0; Second < NumStores; ++Second) {
> +      if (First == Second)
> +        continue;
> +      if (!isConsecutiveAccess(Stores[First], Stores[Second]))
> +        continue;
> +
> +      Tails.insert(Stores[Second]);
> +      Heads.insert(Stores[First]);
> +      ConsecutiveChain[Stores[First]] = Stores[Second];
> +    }
> +  }
> +
> +  bool Changed = false;
> +
> +  // A chain starts at a store that begins a link but does not end one.
> +  for (SetVector<Value *>::iterator HeadIt = Heads.begin(),
> +                                    HeadEnd = Heads.end();
> +       HeadIt != HeadEnd; ++HeadIt) {
> +    Value *Link = *HeadIt;
> +    if (Tails.count(Link))
> +      continue;
> +
> +    // Walk the links from the chain start and collect the stores, stopping
> +    // at the end of the chain or at a store that was already vectorized.
> +    ValueList Chain;
> +    for (; (Tails.count(Link) || Heads.count(Link)) &&
> +           !VectorizedStores.count(Link);
> +         Link = ConsecutiveChain[Link])
> +      Chain.push_back(Link);
> +
> +    // Mark the vectorized stores so that we don't vectorize them again.
> +    if (vectorizeStoreChain(Chain, costThreshold)) {
> +      VectorizedStores.insert(Chain.begin(), Chain.end());
> +      Changed = true;
> +    }
> +  }
> +
> +  return Changed;
> +}
> +
> +/// Build a vector of type \p Ty from the scalars in \p VL with a sequence of
> +/// insertelement operations, one per vector element.
> +///
> +/// Each resulting value that is an Instruction is recorded in GatherSeq, and
> +/// the final vector is registered in VectorizedValues under VL[0].
> +/// \returns the assembled vector value.
> +Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
> +  Value *Result = UndefValue::get(Ty);
> +
> +  for (unsigned Lane = 0; Lane < Ty->getNumElements(); ++Lane) {
> +    Value *Index = Builder.getInt32(Lane);
> +    Result = Builder.CreateInsertElement(Result, VL[Lane], Index);
> +
> +    // The builder result is not necessarily an Instruction; only track the
> +    // ones that are.
> +    Instruction *Insert = dyn_cast<Instruction>(Result);
> +    if (Insert)
> +      GatherSeq.insert(Insert);
> +  }
> +
> +  VectorizedValues[VL[0]] = Result;
> +  return Result;
> +}
> +
> +/// Recursively emit vector code for the bundle of scalars \p VL.
> +///
> +/// The vector type is built from the scalar type of VL[0] (the stored value
> +/// type when VL[0] is a store) and VL.size(). Bundles that must be gathered,
> +/// or whose opcode is not handled, are assembled with Gather. A bundle whose
> +/// first value is already in VectorizedValues reuses that vector. Otherwise
> +/// the operands are vectorized recursively and a single vector instruction
> +/// is created before the last instruction of the bundle.
> +/// \returns the vector value, or 0 for a store bundle.
> +Value *FuncSLP::vectorizeTree_rec(ArrayRef<Value *> VL) {
> +  // Restores the builder's insertion point when this call returns.
> +  BuilderLocGuard Guard(Builder);
> +
> +  Type *ScalarTy = VL[0]->getType();
> +  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> +    ScalarTy = SI->getValueOperand()->getType();
> +  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
> +
> +  if (needToGatherAny(VL))
> +    return Gather(VL, VecTy);
> +
> +  if (VectorizedValues.count(VL[0])) {
> +    DEBUG(dbgs() << "SLP: Diamond merged at depth.\n");
> +    return VectorizedValues[VL[0]];
> +  }
> +
> +  Instruction *VL0 = cast<Instruction>(VL[0]);
> +  unsigned Opcode = VL0->getOpcode();
> +  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
> +
> +  switch (Opcode) {
> +  case Instruction::ExtractElement: {
> +    if (CanReuseExtract(VL, VL.size(), VecTy))
> +      return VL0->getOperand(0);
> +    return Gather(VL, VecTy);
> +  }
> +  case Instruction::ZExt:
> +  case Instruction::SExt:
> +  case Instruction::FPToUI:
> +  case Instruction::FPToSI:
> +  case Instruction::FPExt:
> +  case Instruction::PtrToInt:
> +  case Instruction::IntToPtr:
> +  case Instruction::SIToFP:
> +  case Instruction::UIToFP:
> +  case Instruction::Trunc:
> +  case Instruction::FPTrunc:
> +  case Instruction::BitCast: {
> +    ValueList INVL;
> +    for (int i = 0, e = VL.size(); i < e; ++i)
> +      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
> +
> +    Builder.SetInsertPoint(getLastInstruction(VL));
> +    Value *InVec = vectorizeTree_rec(INVL);
> +    // The opcode guarantees a cast instruction, so use the asserting cast<>
> +    // rather than dereferencing an unchecked dyn_cast<> result.
> +    CastInst *CI = cast<CastInst>(VL0);
> +    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
> +    VectorizedValues[VL0] = V;
> +    return V;
> +  }
> +  case Instruction::FCmp:
> +  case Instruction::ICmp: {
> +    // Check that all of the compares have the same predicate.
> +    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
> +    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
> +      CmpInst *Cmp = cast<CmpInst>(VL[i]);
> +      if (Cmp->getPredicate() != P0)
> +        return Gather(VL, VecTy);
> +    }
> +
> +    ValueList LHSV, RHSV;
> +    for (int i = 0, e = VL.size(); i < e; ++i) {
> +      LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
> +      RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
> +    }
> +
> +    Builder.SetInsertPoint(getLastInstruction(VL));
> +    Value *L = vectorizeTree_rec(LHSV);
> +    Value *R = vectorizeTree_rec(RHSV);
> +    Value *V;
> +
> +    if (Opcode == Instruction::FCmp)
> +      V = Builder.CreateFCmp(P0, L, R);
> +    else
> +      V = Builder.CreateICmp(P0, L, R);
> +
> +    VectorizedValues[VL0] = V;
> +    return V;
> +  }
> +  case Instruction::Select: {
> +    ValueList TrueVec, FalseVec, CondVec;
> +    for (int i = 0, e = VL.size(); i < e; ++i) {
> +      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
> +      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
> +      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
> +    }
> +
> +    Builder.SetInsertPoint(getLastInstruction(VL));
> +    Value *True = vectorizeTree_rec(TrueVec);
> +    Value *False = vectorizeTree_rec(FalseVec);
> +    Value *Cond = vectorizeTree_rec(CondVec);
> +    Value *V = Builder.CreateSelect(Cond, True, False);
> +    VectorizedValues[VL0] = V;
> +    return V;
> +  }
> +  case Instruction::Add:
> +  case Instruction::FAdd:
> +  case Instruction::Sub:
> +  case Instruction::FSub:
> +  case Instruction::Mul:
> +  case Instruction::FMul:
> +  case Instruction::UDiv:
> +  case Instruction::SDiv:
> +  case Instruction::FDiv:
> +  case Instruction::URem:
> +  case Instruction::SRem:
> +  case Instruction::FRem:
> +  case Instruction::Shl:
> +  case Instruction::LShr:
> +  case Instruction::AShr:
> +  case Instruction::And:
> +  case Instruction::Or:
> +  case Instruction::Xor: {
> +    ValueList LHSVL, RHSVL;
> +    for (int i = 0, e = VL.size(); i < e; ++i) {
> +      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
> +      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
> +    }
> +
> +    Builder.SetInsertPoint(getLastInstruction(VL));
> +    Value *LHS = vectorizeTree_rec(LHSVL);
> +    Value *RHS = vectorizeTree_rec(RHSVL);
> +
> +    if (LHS == RHS) {
> +      assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
> +    }
> +
> +    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
> +    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
> +    VectorizedValues[VL0] = V;
> +    return V;
> +  }
> +  case Instruction::Load: {
> +    // Check if all of the loads are consecutive.
> +    for (unsigned i = 1, e = VL.size(); i < e; ++i)
> +      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
> +        return Gather(VL, VecTy);
> +
> +    // Loads are inserted at the head of the tree because we don't want to
> +    // sink them all the way down past store instructions.
> +    Builder.SetInsertPoint(getLastInstruction(VL));
> +    LoadInst *LI = cast<LoadInst>(VL0);
> +    Value *VecPtr =
> +        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
> +    // The wide load keeps the alignment of the first scalar load.
> +    unsigned Alignment = LI->getAlignment();
> +    LI = Builder.CreateLoad(VecPtr);
> +    LI->setAlignment(Alignment);
> +
> +    VectorizedValues[VL0] = LI;
> +    return LI;
> +  }
> +  case Instruction::Store: {
> +    StoreInst *SI = cast<StoreInst>(VL0);
> +    unsigned Alignment = SI->getAlignment();
> +
> +    ValueList ValueOp;
> +    for (int i = 0, e = VL.size(); i < e; ++i)
> +      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
> +
> +    Value *VecValue = vectorizeTree_rec(ValueOp);
> +
> +    Builder.SetInsertPoint(getLastInstruction(VL));
> +    Value *VecPtr =
> +        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
> +    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
> +
> +    // NOTE(review): the scalar stores are erased while Guard is still alive.
> +    // If the insertion point Guard saved is one of these stores (the caller
> +    // positions the builder at the last instruction of the bundle), Guard's
> +    // destructor will touch a freed instruction. This matches the ASan
> +    // heap-use-after-free reported in ~BuilderLocGuard; confirm, and
> +    // consider deferring the erase until after vectorization.
> +    for (int i = 0, e = VL.size(); i < e; ++i)
> +      cast<Instruction>(VL[i])->eraseFromParent();
> +    return 0;
> +  }
> +  default:
> +    return Gather(VL, VecTy);
> +  }
> +}
> +
> +/// Vectorize the tree rooted at \p VL and reset the per-tree state.
> +///
> +/// Positions the builder at the last instruction of \p VL, runs
> +/// vectorizeTree_rec, then clears the bookkeeping sets and makes every
> +/// block's numbering forget its contents.
> +/// \returns whatever vectorizeTree_rec returned for the roots.
> +Value *FuncSLP::vectorizeTree(ArrayRef<Value *> VL) {
> +  Builder.SetInsertPoint(getLastInstruction(VL));
> +  Value *Root = vectorizeTree_rec(VL);
> +
> +  // Instructions were moved around, so the cached numbering and the
> +  // per-tree sets are stale; drop them before any further analysis.
> +  MustGather.clear();
> +  VectorizedValues.clear();
> +  MemBarrierIgnoreList.clear();
> +
> +  Function::iterator BlockIt = F->begin();
> +  const Function::iterator BlockEnd = F->end();
> +  while (BlockIt != BlockEnd) {
> +    BlocksNumbers[BlockIt].forget();
> +    ++BlockIt;
> +  }
> +
> +  return Root;
> +}
> +
> +/// Vectorize the tree rooted at \p Operands and rewire the scalar users.
> +///
> +/// For every lane an extractelement of the vectorized value is created, and
> +/// all uses of the corresponding scalar are replaced with it.
> +/// \returns the vector produced by vectorizeTree.
> +Value *FuncSLP::vectorizeArith(ArrayRef<Value *> Operands) {
> +  Value *Vec = vectorizeTree(Operands);
> +
> +  // Replace each scalar with the matching lane extracted from the vector.
> +  const unsigned NumLanes = Operands.size();
> +  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
> +    Value *LaneIndex = Builder.getInt32(Lane);
> +    Value *Extracted = Builder.CreateExtractElement(Vec, LaneIndex);
> +    Operands[Lane]->replaceAllUsesWith(Extracted);
> +  }
> +
> +  return Vec;
> +}
> +
> +/// Hoist gather (insertelement) instructions out of loops when both the
> +/// vector and the inserted element are defined outside the loop.
> +///
> +/// Walks every entry of GatherSeq. An entry is skipped when it is not an
> +/// InsertElementInst, when its block is not inside a loop, when the loop has
> +/// no preheader, or when one of its two operands is an instruction contained
> +/// in the loop. Every other entry is moved in front of the preheader's
> +/// terminator.
> +void FuncSLP::hoistGatherSequence() {
> +  for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
> +                                          e = GatherSeq.end();
> +       it != e; ++it) {
> +    InsertElementInst *Insert = dyn_cast_or_null<InsertElementInst>(*it);
> +
> +    // The InsertElement sequence can be simplified into a constant.
> +    // Also Ignore NULL pointers because they are only here to separate
> +    // sequences.
> +    if (!Insert)
> +      continue;
> +
> +    BasicBlock *BB = Insert->getParent();
> +
> +    // Check if this block is inside a loop. GatherSeq spans the whole
> +    // function, so a gather outside of any loop must only skip this entry,
> +    // not stop the hoisting of the remaining ones.
> +    Loop *L = LI->getLoopFor(BB);
> +    if (!L)
> +      continue;
> +
> +    // Check if it has a preheader. Other gathers may live in loops that do
> +    // have one, so keep going.
> +    BasicBlock *PreHeader = L->getLoopPreheader();
> +    if (!PreHeader)
> +      continue;
> +
> +    // If the vector or the element that we insert into it are
> +    // instructions that are defined inside the loop then we can't
> +    // hoist this instruction.
> +    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
> +    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
> +    if (CurrVec && L->contains(CurrVec))
> +      continue;
> +    if (NewElem && L->contains(NewElem))
> +      continue;
> +
> +    // Mark the insertion point for the block.
> +    Instruction *Location = PreHeader->getTerminator();
> +    // We can hoist this instruction. Move it to the pre-header.
> +    Insert->moveBefore(Location);
> +  }
> +}
> +
>  /// The SLPVectorizer Pass.
>  struct SLPVectorizer : public FunctionPass {
> -  typedef MapVector<Value *, BoUpSLP::StoreList> StoreListMap;
> +  typedef SmallVector<StoreInst *, 8> StoreList;
> +  typedef MapVector<Value *, StoreList> StoreListMap;
>
>    /// Pass identification, replacement for typeid
>    static char ID;
> @@ -80,34 +1213,26 @@ struct SLPVectorizer : public FunctionPa
>
>      DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
>
> +    // Use the bottom-up SLP vectorizer to construct chains that start with
> +    // the store instructions.
> +    FuncSLP R(&F, SE, DL, TTI, AA, LI);
> +
>      for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
>        BasicBlock *BB = it;
> -      bool BBChanged = false;
> -
> -      // Use the bollom up slp vectorizer to construct chains that start
> with
> -      // he store instructions.
> -      BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
>
>        // Vectorize trees that end at reductions.
> -      BBChanged |= vectorizeChainsInBlock(BB, R);
> +      Changed |= vectorizeChainsInBlock(BB, R);
>
>        // Vectorize trees that end at stores.
>        if (unsigned count = collectStores(BB, R)) {
>          (void)count;
>          DEBUG(dbgs() << "SLP: Found " << count << " stores to
> vectorize.\n");
> -        BBChanged |= vectorizeStoreChains(R);
> +        Changed |= vectorizeStoreChains(R);
>        }
> -
> -      // Try to hoist some of the scalarization code to the preheader.
> -      if (BBChanged) {
> -        hoistGatherSequence(LI, BB, R);
> -        Changed |=
> vectorizeUsingGatherHints(R.getGatherSeqInstructions());
> -      }
> -
> -      Changed |= BBChanged;
>      }
>
>      if (Changed) {
> +      R.hoistGatherSequence();
>        DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
>        DEBUG(verifyFunction(F));
>      }
> @@ -128,42 +1253,31 @@ private:
>    /// object. We sort the stores to their base objects to reduce the cost
> of the
>    /// quadratic search on the stores. TODO: We can further reduce this
> cost
>    /// if we flush the chain creation every time we run into a memory
> barrier.
> -  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
> +  unsigned collectStores(BasicBlock *BB, FuncSLP &R);
>
>    /// \brief Try to vectorize a chain that starts at two arithmetic
> instrs.
> -  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
> +  bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
>
>    /// \brief Try to vectorize a list of operands. If \p NeedExtracts is
> true
>    /// then we calculate the cost of extracting the scalars from the
> vector.
>    /// \returns true if a value was vectorized.
> -  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool
> NeedExtracts);
> +  bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool
> NeedExtracts);
>
>    /// \brief Try to vectorize a chain that may start at the operands of
> \V;
> -  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
> +  bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
>
>    /// \brief Vectorize the stores that were collected in StoreRefs.
> -  bool vectorizeStoreChains(BoUpSLP &R);
> -
> -  /// \brief Try to hoist gather sequences outside of the loop in cases
> where
> -  /// all of the sources are loop invariant.
> -  void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
> -
> -  /// \brief Try to vectorize additional sequences in different basic
> blocks
> -  /// based on values that we gathered in previous blocks. The list \p
> Gathers
> -  /// holds the gather InsertElement instructions that were generated
> during
> -  /// vectorization.
> -  /// \returns True if some code was vectorized.
> -  bool vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers);
> +  bool vectorizeStoreChains(FuncSLP &R);
>
>    /// \brief Scan the basic block and look for patterns that are likely
> to start
>    /// a vectorization chain.
> -  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
> +  bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R);
>
>  private:
>    StoreListMap StoreRefs;
>  };
>
> -unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
> +unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
>    unsigned count = 0;
>    StoreRefs.clear();
>    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;
> ++it) {
> @@ -188,14 +1302,14 @@ unsigned SLPVectorizer::collectStores(Ba
>    return count;
>  }
>
> -bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
> +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) {
>    if (!A || !B)
>      return false;
>    Value *VL[] = { A, B };
>    return tryToVectorizeList(VL, R, true);
>  }
>
> -bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
> +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
>                                         bool NeedExtracts) {
>    if (VL.size() < 2)
>      return false;
> @@ -219,7 +1333,10 @@ bool SLPVectorizer::tryToVectorizeList(A
>    }
>
>    int Cost = R.getTreeCost(VL);
> -  int ExtrCost = NeedExtracts ? R.getScalarizationCost(VL) : 0;
> +  if (Cost == FuncSLP::MAX_COST)
> +    return false;
> +
> +  int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0;
>    DEBUG(dbgs() << "SLP: Cost of pair:" << Cost
>                 << " Cost of extract:" << ExtrCost << ".\n");
>    if ((Cost + ExtrCost) >= -SLPCostThreshold)
> @@ -229,10 +1346,10 @@ bool SLPVectorizer::tryToVectorizeList(A
>    return true;
>  }
>
> -bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
> +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
>    if (!V)
>      return false;
> -
> +
>    // Try to vectorize V.
>    if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
>      return true;
> @@ -269,7 +1386,7 @@ bool SLPVectorizer::tryToVectorize(Binar
>    return 0;
>  }
>
> -bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
> +bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
>    bool Changed = false;
>    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;
> ++it) {
>      if (isa<DbgInfoIntrinsic>(it))
> @@ -292,7 +1409,7 @@ bool SLPVectorizer::vectorizeChainsInBlo
>        Value *Inst = BI->getOperand(0);
>        if (Inst == P)
>          Inst = BI->getOperand(1);
> -
> +
>        Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
>        continue;
>      }
> @@ -337,7 +1454,7 @@ bool SLPVectorizer::vectorizeChainsInBlo
>    return Changed;
>  }
>
> -bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
> +bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
>    bool Changed = false;
>    // Attempt to sort and vectorize each of the store-groups.
>    for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
> @@ -353,92 +1470,6 @@ bool SLPVectorizer::vectorizeStoreChains
>    return Changed;
>  }
>
> -bool SLPVectorizer::vectorizeUsingGatherHints(BoUpSLP::InstrList
> &Gathers) {
> -  SmallVector<Value *, 4> Seq;
> -  bool Changed = false;
> -  for (int i = 0, e = Gathers.size(); i < e; ++i) {
> -    InsertElementInst *IEI =
> dyn_cast_or_null<InsertElementInst>(Gathers[i]);
> -
> -    if (IEI) {
> -      if (Instruction *I = dyn_cast<Instruction>(IEI->getOperand(1)))
> -        Seq.push_back(I);
> -    } else {
> -
> -      if (!Seq.size())
> -        continue;
> -
> -      Instruction *I = cast<Instruction>(Seq[0]);
> -      BasicBlock *BB = I->getParent();
> -
> -      DEBUG(dbgs() << "SLP: Inspecting a gather list of size " <<
> Seq.size()
> -                   << " in " << BB->getName() << ".\n");
> -
> -      // Check if the gathered values have multiple uses. If they only
> have one
> -      // user then we know that the insert/extract pair will go away.
> -      bool HasMultipleUsers = false;
> -      for (int i = 0; e = Seq.size(), i < e; ++i) {
> -        if (!Seq[i]->hasOneUse()) {
> -          HasMultipleUsers = true;
> -          break;
> -        }
> -      }
> -
> -      BoUpSLP BO(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
> -
> -      if (tryToVectorizeList(Seq, BO, HasMultipleUsers)) {
> -        DEBUG(dbgs() << "SLP: Vectorized a gather list of len " <<
> Seq.size()
> -                     << " in " << BB->getName() << ".\n");
> -        Changed = true;
> -      }
> -
> -      Seq.clear();
> -    }
> -  }
> -
> -  return Changed;
> -}
> -
> -void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
> -                                        BoUpSLP &R) {
> -  // Check if this block is inside a loop.
> -  Loop *L = LI->getLoopFor(BB);
> -  if (!L)
> -    return;
> -
> -  // Check if it has a preheader.
> -  BasicBlock *PreHeader = L->getLoopPreheader();
> -  if (!PreHeader)
> -    return;
> -
> -  // Mark the insertion point for the block.
> -  Instruction *Location = PreHeader->getTerminator();
> -
> -  BoUpSLP::InstrList &Gathers = R.getGatherSeqInstructions();
> -  for (BoUpSLP::InstrList::iterator it = Gathers.begin(), e =
> Gathers.end();
> -       it != e; ++it) {
> -    InsertElementInst *Insert = dyn_cast_or_null<InsertElementInst>(*it);
> -
> -    // The InsertElement sequence can be simplified into a constant.
> -    // Also Ignore NULL pointers because they are only here to separate
> -    // sequences.
> -    if (!Insert)
> -      continue;
> -
> -    // If the vector or the element that we insert into it are
> -    // instructions that are defined in this basic block then we can't
> -    // hoist this instruction.
> -    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
> -    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
> -    if (CurrVec && L->contains(CurrVec))
> -      continue;
> -    if (NewElem && L->contains(NewElem))
> -      continue;
> -
> -    // We can hoist this instruction. Move it to the pre-header.
> -    Insert->moveBefore(Location);
> -  }
> -}
> -
>  } // end anonymous namespace
>
>  char SLPVectorizer::ID = 0;
>
> Removed: llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp?rev=184646&view=auto
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp (removed)
> @@ -1,1031 +0,0 @@
> -//===- VecUtils.cpp --- Vectorization Utilities
> ---------------------------===//
> -//
> -//                     The LLVM Compiler Infrastructure
> -//
> -// This file is distributed under the University of Illinois Open Source
> -// License. See LICENSE.TXT for details.
> -//
>
> -//===----------------------------------------------------------------------===//
> -#define DEBUG_TYPE "SLP"
> -
> -#include "VecUtils.h"
> -#include "llvm/ADT/DenseMap.h"
> -#include "llvm/ADT/SmallPtrSet.h"
> -#include "llvm/ADT/SmallSet.h"
> -#include "llvm/ADT/SmallVector.h"
> -#include "llvm/Analysis/AliasAnalysis.h"
> -#include "llvm/Analysis/ScalarEvolution.h"
> -#include "llvm/Analysis/ScalarEvolutionExpressions.h"
> -#include "llvm/Analysis/TargetTransformInfo.h"
> -#include "llvm/Analysis/Verifier.h"
> -#include "llvm/Analysis/LoopInfo.h"
> -#include "llvm/IR/Constants.h"
> -#include "llvm/IR/DataLayout.h"
> -#include "llvm/IR/Function.h"
> -#include "llvm/IR/Instructions.h"
> -#include "llvm/IR/Module.h"
> -#include "llvm/IR/Type.h"
> -#include "llvm/IR/Value.h"
> -#include "llvm/Pass.h"
> -#include "llvm/Support/CommandLine.h"
> -#include "llvm/Support/Debug.h"
> -#include "llvm/Support/raw_ostream.h"
> -#include "llvm/Target/TargetLibraryInfo.h"
> -#include "llvm/Transforms/Scalar.h"
> -#include "llvm/Transforms/Utils/Local.h"
> -#include <algorithm>
> -#include <map>
> -
> -using namespace llvm;
> -
> -static const unsigned MinVecRegSize = 128;
> -
> -static const unsigned RecursionMaxDepth = 6;
> -
> -namespace llvm {
> -
> -BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
> -                 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp)
> -    : Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa),
> L(Lp) {
> -  numberInstructions();
> -}
> -
> -void BoUpSLP::numberInstructions() {
> -  int Loc = 0;
> -  InstrIdx.clear();
> -  InstrVec.clear();
> -  // Number the instructions in the block.
> -  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;
> ++it) {
> -    InstrIdx[it] = Loc++;
> -    InstrVec.push_back(it);
> -    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
> -  }
> -}
> -
> -Value *BoUpSLP::getPointerOperand(Value *I) {
> -  if (LoadInst *LI = dyn_cast<LoadInst>(I))
> -    return LI->getPointerOperand();
> -  if (StoreInst *SI = dyn_cast<StoreInst>(I))
> -    return SI->getPointerOperand();
> -  return 0;
> -}
> -
> -unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
> -  if (LoadInst *L = dyn_cast<LoadInst>(I))
> -    return L->getPointerAddressSpace();
> -  if (StoreInst *S = dyn_cast<StoreInst>(I))
> -    return S->getPointerAddressSpace();
> -  return -1;
> -}
> -
> -bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
> -  Value *PtrA = getPointerOperand(A);
> -  Value *PtrB = getPointerOperand(B);
> -  unsigned ASA = getAddressSpaceOperand(A);
> -  unsigned ASB = getAddressSpaceOperand(B);
> -
> -  // Check that the address spaces match and that the pointers are valid.
> -  if (!PtrA || !PtrB || (ASA != ASB))
> -    return false;
> -
> -  // Check that A and B are of the same type.
> -  if (PtrA->getType() != PtrB->getType())
> -    return false;
> -
> -  // Calculate the distance.
> -  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
> -  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
> -  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
> -  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
> -
> -  // Non constant distance.
> -  if (!ConstOffSCEV)
> -    return false;
> -
> -  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
> -  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
> -  // The Instructions are connsecutive if the size of the first
> load/store is
> -  // the same as the offset.
> -  int64_t Sz = DL->getTypeStoreSize(Ty);
> -  return ((-Offset) == Sz);
> -}
> -
> -bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int
> CostThreshold) {
> -  unsigned ChainLen = Chain.size();
> -  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
> -               << "\n");
> -  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
> -  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
> -  unsigned VF = MinVecRegSize / Sz;
> -
> -  if (!isPowerOf2_32(Sz) || VF < 2)
> -    return false;
> -
> -  bool Changed = false;
> -  // Look for profitable vectorizable trees at all offsets, starting at
> zero.
> -  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
> -    if (i + VF > e)
> -      break;
> -    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
> -                 << "\n");
> -    ArrayRef<Value *> Operands = Chain.slice(i, VF);
> -
> -    int Cost = getTreeCost(Operands);
> -    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
> -    if (Cost < CostThreshold) {
> -      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
> -      Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands, VF)));
> -      vectorizeTree(Operands, VF);
> -      i += VF - 1;
> -      Changed = true;
> -    }
> -  }
> -
> -  if (Changed || ChainLen > VF)
> -    return Changed;
> -
> -  // Handle short chains. This helps us catch types such as <3 x float>
> that
> -  // are smaller than vector size.
> -  int Cost = getTreeCost(Chain);
> -  if (Cost < CostThreshold) {
> -    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
> -                 << " for size = " << ChainLen << "\n");
> -    Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Chain, ChainLen)));
> -    vectorizeTree(Chain, ChainLen);
> -    return true;
> -  }
> -
> -  return false;
> -}
> -
> -bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
> -  SetVector<Value *> Heads, Tails;
> -  SmallDenseMap<Value *, Value *> ConsecutiveChain;
> -
> -  // We may run into multiple chains that merge into a single chain. We
> mark the
> -  // stores that we vectorized so that we don't visit the same store
> twice.
> -  ValueSet VectorizedStores;
> -  bool Changed = false;
> -
> -  // Do a quadratic search on all of the given stores and find
> -  // all of the pairs of loads that follow each other.
> -  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
> -    for (unsigned j = 0; j < e; ++j) {
> -      if (i == j)
> -        continue;
> -
> -      if (isConsecutiveAccess(Stores[i], Stores[j])) {
> -        Tails.insert(Stores[j]);
> -        Heads.insert(Stores[i]);
> -        ConsecutiveChain[Stores[i]] = Stores[j];
> -      }
> -    }
> -
> -  // For stores that start but don't end a link in the chain:
> -  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
> -       it != e; ++it) {
> -    if (Tails.count(*it))
> -      continue;
> -
> -    // We found a store instr that starts a chain. Now follow the chain
> and try
> -    // to vectorize it.
> -    ValueList Operands;
> -    Value *I = *it;
> -    // Collect the chain into a list.
> -    while (Tails.count(I) || Heads.count(I)) {
> -      if (VectorizedStores.count(I))
> -        break;
> -      Operands.push_back(I);
> -      // Move to the next value in the chain.
> -      I = ConsecutiveChain[I];
> -    }
> -
> -    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
> -
> -    // Mark the vectorized stores so that we don't vectorize them again.
> -    if (Vectorized)
> -      VectorizedStores.insert(Operands.begin(), Operands.end());
> -    Changed |= Vectorized;
> -  }
> -
> -  return Changed;
> -}
> -
> -int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
> -  // Find the type of the operands in VL.
> -  Type *ScalarTy = VL[0]->getType();
> -  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> -    ScalarTy = SI->getValueOperand()->getType();
> -  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
> -  // Find the cost of inserting/extracting values from the vector.
> -  return getScalarizationCost(VecTy);
> -}
> -
> -int BoUpSLP::getScalarizationCost(Type *Ty) {
> -  int Cost = 0;
> -  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
> -    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
> -  return Cost;
> -}
> -
> -AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
> -  if (StoreInst *SI = dyn_cast<StoreInst>(I))
> -    return AA->getLocation(SI);
> -  if (LoadInst *LI = dyn_cast<LoadInst>(I))
> -    return AA->getLocation(LI);
> -  return AliasAnalysis::Location();
> -}
> -
> -Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
> -  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
> -  BasicBlock::iterator I = Src, E = Dst;
> -  /// Scan all of the instruction from SRC to DST and check if
> -  /// the source may alias.
> -  for (++I; I != E; ++I) {
> -    // Ignore store instructions that are marked as 'ignore'.
> -    if (MemBarrierIgnoreList.count(I))
> -      continue;
> -    if (Src->mayWriteToMemory()) /* Write */ {
> -      if (!I->mayReadOrWriteMemory())
> -        continue;
> -    } else /* Read */ {
> -      if (!I->mayWriteToMemory())
> -        continue;
> -    }
> -    AliasAnalysis::Location A = getLocation(&*I);
> -    AliasAnalysis::Location B = getLocation(Src);
> -
> -    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
> -      return I;
> -  }
> -  return 0;
> -}
> -
> -Value *BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
> -  int LastIdx = getLastIndex(Operands, Operands.size());
> -  Instruction *Loc = getInsertionPoint(LastIdx);
> -  Builder.SetInsertPoint(Loc);
> -
> -  assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx &&
> -         "Vectorizing with in-tree users");
> -
> -  Value *Vec = vectorizeTree(Operands, Operands.size());
> -  // After vectorizing the operands we need to generate extractelement
> -  // instructions and replace all of the uses of the scalar values with
> -  // the values that we extracted from the vectorized tree.
> -  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
> -    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
> -    Operands[i]->replaceAllUsesWith(S);
> -  }
> -
> -  return Vec;
> -}
> -
> -int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
> -  // Get rid of the list of stores that were removed, and from the
> -  // lists of instructions with multiple users.
> -  MemBarrierIgnoreList.clear();
> -  LaneMap.clear();
> -  MultiUserVals.clear();
> -  MustScalarize.clear();
> -  MustExtract.clear();
> -
> -  // Find the location of the last root.
> -  int LastRootIndex = getLastIndex(VL, VL.size());
> -  int FirstUserIndex = getFirstUserIndex(VL, VL.size());
> -
> -  // Don't vectorize if there are users of the tree roots inside the tree
> -  // itself.
> -  if (LastRootIndex > FirstUserIndex)
> -    return max_cost;
> -
> -  // Scan the tree and find which value is used by which lane, and which
> values
> -  // must be scalarized.
> -  getTreeUses_rec(VL, 0);
> -
> -  // Check that instructions with multiple users can be vectorized. Mark
> unsafe
> -  // instructions.
> -  for (SetVector<Value *>::iterator it = MultiUserVals.begin(),
> -                                    e = MultiUserVals.end();
> -       it != e; ++it) {
> -    // Check that all of the users of this instr are within the tree
> -    // and that they are all from the same lane.
> -    int Lane = -1;
> -    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
> -         I != E; ++I) {
> -      if (LaneMap.find(*I) == LaneMap.end()) {
> -        DEBUG(dbgs() << "SLP: Instr " << **it << " has multiple users.\n");
> -
> -        // We don't have an ordering problem if the user is not in this
> basic
> -        // block.
> -        Instruction *Inst = cast<Instruction>(*I);
> -        if (Inst->getParent() != BB) {
> -          MustExtract.insert(*it);
> -          continue;
> -        }
> -
> -        // We don't have an ordering problem if the user is after the
> last root.
> -        int Idx = InstrIdx[Inst];
> -        if (Idx < LastRootIndex) {
> -          MustScalarize.insert(*it);
> -          DEBUG(dbgs() << "SLP: Adding to MustScalarize "
> -                          "because of an unsafe out of tree usage.\n");
> -          break;
> -        }
> -
> -        DEBUG(dbgs() << "SLP: Adding to MustExtract "
> -                        "because of a safe out of tree usage.\n");
> -        MustExtract.insert(*it);
> -        continue;
> -      }
> -      if (Lane == -1)
> -        Lane = LaneMap[*I];
> -      if (Lane != LaneMap[*I]) {
> -        MustScalarize.insert(*it);
> -        DEBUG(dbgs() << "SLP: Adding " << **it
> -                     << " to MustScalarize because multiple lane use it: "
> -                     << Lane << " and " << LaneMap[*I] << ".\n");
> -        break;
> -      }
> -    }
> -  }
> -
> -  // Now calculate the cost of vectorizing the tree.
> -  return getTreeCost_rec(VL, 0);
> -}
> -
> -static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
> -                            VectorType *VecTy) {
> -  // Check if all of the extracts come from the same vector and from the
> -  // correct offset.
> -  Value *VL0 = VL[0];
> -  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
> -  Value *Vec = E0->getOperand(0);
> -
> -  // We have to extract from the same vector type.
> -  if (Vec->getType() != VecTy)
> -    return false;
> -
> -  // Check that all of the indices extract from the correct offset.
> -  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
> -  if (!CI || CI->getZExtValue())
> -    return false;
> -
> -  for (unsigned i = 1, e = VF; i < e; ++i) {
> -    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
> -    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
> -
> -    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
> -      return false;
> -  }
> -
> -  return true;
> -}
> -
> -void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
> -  if (Depth == RecursionMaxDepth)
> -    return;
> -
> -  // Don't handle vectors.
> -  if (VL[0]->getType()->isVectorTy())
> -    return;
> -
> -  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> -    if (SI->getValueOperand()->getType()->isVectorTy())
> -      return;
> -
> -  // Check if all of the operands are constants.
> -  bool AllConst = true;
> -  bool AllSameScalar = true;
> -  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> -    AllConst &= isa<Constant>(VL[i]);
> -    AllSameScalar &= (VL[0] == VL[i]);
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    // If one of the instructions is out of this BB, we need to scalarize
> all.
> -    if (I && I->getParent() != BB)
> -      return;
> -  }
> -
> -  // If all of the operands are identical or constant we have a simple
> solution.
> -  if (AllConst || AllSameScalar)
> -    return;
> -
> -  // Scalarize unknown structures.
> -  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
> -  if (!VL0)
> -    return;
> -
> -  unsigned Opcode = VL0->getOpcode();
> -  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    // If not all of the instructions are identical then we have to
> scalarize.
> -    if (!I || Opcode != I->getOpcode())
> -      return;
> -  }
> -
> -  for (int i = 0, e = VL.size(); i < e; ++i) {
> -    // Check that the instruction is only used within
> -    // one lane.
> -    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i)
> -      return;
> -    // Make this instruction as 'seen' and remember the lane.
> -    LaneMap[VL[i]] = i;
> -  }
> -
> -  // Mark instructions with multiple users.
> -  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    // Remember to check if all of the users of this instr are vectorized
> -    // within our tree. At depth zero we have no local users, only
> external
> -    // users that we don't care about.
> -    if (Depth && I && I->getNumUses() > 1) {
> -      DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
> -                      "because it has multiple users:" << *I << " \n");
> -      MultiUserVals.insert(I);
> -    }
> -  }
> -
> -  switch (Opcode) {
> -  case Instruction::ExtractElement: {
> -    VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
> -    // No need to follow ExtractElements that are going to be optimized
> away.
> -    if (CanReuseExtract(VL, VL.size(), VecTy))
> -      return;
> -    // Fall through.
> -  }
> -  case Instruction::ZExt:
> -  case Instruction::SExt:
> -  case Instruction::FPToUI:
> -  case Instruction::FPToSI:
> -  case Instruction::FPExt:
> -  case Instruction::PtrToInt:
> -  case Instruction::IntToPtr:
> -  case Instruction::SIToFP:
> -  case Instruction::UIToFP:
> -  case Instruction::Trunc:
> -  case Instruction::FPTrunc:
> -  case Instruction::BitCast:
> -  case Instruction::Select:
> -  case Instruction::ICmp:
> -  case Instruction::FCmp:
> -  case Instruction::Add:
> -  case Instruction::FAdd:
> -  case Instruction::Sub:
> -  case Instruction::FSub:
> -  case Instruction::Mul:
> -  case Instruction::FMul:
> -  case Instruction::UDiv:
> -  case Instruction::SDiv:
> -  case Instruction::FDiv:
> -  case Instruction::URem:
> -  case Instruction::SRem:
> -  case Instruction::FRem:
> -  case Instruction::Shl:
> -  case Instruction::LShr:
> -  case Instruction::AShr:
> -  case Instruction::And:
> -  case Instruction::Or:
> -  case Instruction::Xor: {
> -    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> -      ValueList Operands;
> -      // Prepare the operand vector.
> -      for (unsigned j = 0; j < VL.size(); ++j)
> -        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
> -
> -      getTreeUses_rec(Operands, Depth + 1);
> -    }
> -    return;
> -  }
> -  case Instruction::Store: {
> -    ValueList Operands;
> -    for (unsigned j = 0; j < VL.size(); ++j)
> -      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
> -    getTreeUses_rec(Operands, Depth + 1);
> -    return;
> -  }
> -  default:
> -    return;
> -  }
> -}
> -
> -int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
> -  Type *ScalarTy = VL[0]->getType();
> -
> -  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> -    ScalarTy = SI->getValueOperand()->getType();
> -
> -  /// Don't mess with vectors.
> -  if (ScalarTy->isVectorTy())
> -    return max_cost;
> -
> -  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
> -
> -  if (Depth == RecursionMaxDepth)
> -    return getScalarizationCost(VecTy);
> -
> -  // Check if all of the operands are constants.
> -  bool AllConst = true;
> -  bool AllSameScalar = true;
> -  bool MustScalarizeFlag = false;
> -  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> -    AllConst &= isa<Constant>(VL[i]);
> -    AllSameScalar &= (VL[0] == VL[i]);
> -    // Must have a single use.
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    MustScalarizeFlag |= MustScalarize.count(VL[i]);
> -    // This instruction is outside the basic block.
> -    if (I && I->getParent() != BB)
> -      return getScalarizationCost(VecTy);
> -  }
> -
> -  // Is this a simple vector constant.
> -  if (AllConst)
> -    return 0;
> -
> -  // If all of the operands are identical we can broadcast them.
> -  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
> -  if (AllSameScalar) {
> -    // If we are in a loop, and this is not an instruction (e.g. constant
> or
> -    // argument) or the instruction is defined outside the loop then
> assume
> -    // that the cost is zero.
> -    if (L && (!VL0 || !L->contains(VL0)))
> -      return 0;
> -
> -    // We need to broadcast the scalar.
> -    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
> -  }
> -
> -  // If this is not a constant, or a scalar from outside the loop then we
> -  // need to scalarize it.
> -  if (MustScalarizeFlag)
> -    return getScalarizationCost(VecTy);
> -
> -  if (!VL0)
> -    return getScalarizationCost(VecTy);
> -  assert(VL0->getParent() == BB && "Wrong BB");
> -
> -  unsigned Opcode = VL0->getOpcode();
> -  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    // If not all of the instructions are identical then we have to
> scalarize.
> -    if (!I || Opcode != I->getOpcode())
> -      return getScalarizationCost(VecTy);
> -  }
> -
> -  // Check if it is safe to sink the loads or the stores.
> -  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
> -    int MaxIdx = getLastIndex(VL, VL.size());
> -    Instruction *Last = InstrVec[MaxIdx];
> -
> -    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
> -      if (VL[i] == Last)
> -        continue;
> -      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
> -      if (Barrier) {
> -        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
> -                     << "\n because of " << *Barrier << "\n");
> -        return max_cost;
> -      }
> -    }
> -  }
> -
> -  // Calculate the extract cost.
> -  unsigned ExternalUserExtractCost = 0;
> -  for (unsigned i = 0, e = VL.size(); i < e; ++i)
> -    if (MustExtract.count(VL[i]))
> -      ExternalUserExtractCost +=
> -          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
> -
> -  switch (Opcode) {
> -  case Instruction::ExtractElement: {
> -    if (CanReuseExtract(VL, VL.size(), VecTy))
> -      return 0;
> -    return getScalarizationCost(VecTy);
> -  }
> -  case Instruction::ZExt:
> -  case Instruction::SExt:
> -  case Instruction::FPToUI:
> -  case Instruction::FPToSI:
> -  case Instruction::FPExt:
> -  case Instruction::PtrToInt:
> -  case Instruction::IntToPtr:
> -  case Instruction::SIToFP:
> -  case Instruction::UIToFP:
> -  case Instruction::Trunc:
> -  case Instruction::FPTrunc:
> -  case Instruction::BitCast: {
> -    int Cost = ExternalUserExtractCost;
> -    ValueList Operands;
> -    Type *SrcTy = VL0->getOperand(0)->getType();
> -    // Prepare the operand vector.
> -    for (unsigned j = 0; j < VL.size(); ++j) {
> -      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
> -      // Check that the casted type is the same for all users.
> -      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
> -        return getScalarizationCost(VecTy);
> -    }
> -
> -    Cost += getTreeCost_rec(Operands, Depth + 1);
> -    if (Cost >= max_cost)
> -      return max_cost;
> -
> -    // Calculate the cost of this instruction.
> -    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
> -                                                       VL0->getType(), SrcTy);
> -
> -    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
> -    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
> -    Cost += (VecCost - ScalarCost);
> -    return Cost;
> -  }
> -  case Instruction::FCmp:
> -  case Instruction::ICmp: {
> -    // Check that all of the compares have the same predicate.
> -    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
> -    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
> -      CmpInst *Cmp = cast<CmpInst>(VL[i]);
> -      if (Cmp->getPredicate() != P0)
> -        return getScalarizationCost(VecTy);
> -    }
> -    // Fall through.
> -  }
> -  case Instruction::Select:
> -  case Instruction::Add:
> -  case Instruction::FAdd:
> -  case Instruction::Sub:
> -  case Instruction::FSub:
> -  case Instruction::Mul:
> -  case Instruction::FMul:
> -  case Instruction::UDiv:
> -  case Instruction::SDiv:
> -  case Instruction::FDiv:
> -  case Instruction::URem:
> -  case Instruction::SRem:
> -  case Instruction::FRem:
> -  case Instruction::Shl:
> -  case Instruction::LShr:
> -  case Instruction::AShr:
> -  case Instruction::And:
> -  case Instruction::Or:
> -  case Instruction::Xor: {
> -    int Cost = ExternalUserExtractCost;
> -    // Calculate the cost of all of the operands.
> -    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> -      ValueList Operands;
> -      // Prepare the operand vector.
> -      for (unsigned j = 0; j < VL.size(); ++j)
> -        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
> -
> -      Cost += getTreeCost_rec(Operands, Depth + 1);
> -      if (Cost >= max_cost)
> -        return max_cost;
> -    }
> -
> -    // Calculate the cost of this instruction.
> -    int ScalarCost = 0;
> -    int VecCost = 0;
> -    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
> -        Opcode == Instruction::Select) {
> -      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
> -      ScalarCost =
> -          VecTy->getNumElements() *
> -          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
> -      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
> -    } else {
> -      ScalarCost = VecTy->getNumElements() *
> -                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
> -      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
> -    }
> -    Cost += (VecCost - ScalarCost);
> -    return Cost;
> -  }
> -  case Instruction::Load: {
> -    // If we are scalarize the loads, add the cost of forming the vector.
> -    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
> -      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
> -        return getScalarizationCost(VecTy);
> -
> -    // Cost of wide load - cost of scalar loads.
> -    int ScalarLdCost = VecTy->getNumElements() *
> -                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
> -    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
> -    return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
> -  }
> -  case Instruction::Store: {
> -    // We know that we can merge the stores. Calculate the cost.
> -    int ScalarStCost = VecTy->getNumElements() *
> -                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
> -    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
> -    int StoreCost = VecStCost - ScalarStCost;
> -
> -    ValueList Operands;
> -    for (unsigned j = 0; j < VL.size(); ++j) {
> -      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
> -      MemBarrierIgnoreList.insert(VL[j]);
> -    }
> -
> -    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
> -    return TotalCost + ExternalUserExtractCost;
> -  }
> -  default:
> -    // Unable to vectorize unknown instructions.
> -    return getScalarizationCost(VecTy);
> -  }
> -}
> -
> -int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) {
> -  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
> -  for (unsigned i = 0; i < VF; ++i)
> -    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
> -  return MaxIdx;
> -}
> -
> -int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) {
> -  // Find the first user of the values.
> -  int FirstUser = InstrVec.size();
> -  for (unsigned i = 0; i < VF; ++i) {
> -    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
> -         U != UE; ++U) {
> -      Instruction *Instr = dyn_cast<Instruction>(*U);
> -      if (!Instr || Instr->getParent() != BB)
> -        continue;
> -
> -      FirstUser = std::min(FirstUser, InstrIdx[Instr]);
> -    }
> -  }
> -  return FirstUser;
> -}
> -
> -int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) {
> -  assert(I->getParent() == BB && "Invalid parent for instruction I");
> -  assert(J->getParent() == BB && "Invalid parent for instruction J");
> -  return std::max(InstrIdx[I], InstrIdx[J]);
> -}
> -
> -Instruction *BoUpSLP::getInsertionPoint(unsigned Index) {
> -  return InstrVec[Index + 1];
> -}
> -
> -Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
> -  Value *Vec = UndefValue::get(Ty);
> -  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
> -    // Generate the 'InsertElement' instruction.
> -    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
> -    // Remember that this instruction is used as part of a 'gather'
> sequence.
> -    // The caller of the bottom-up slp vectorizer can try to hoist the
> sequence
> -    // if the users are outside of the basic block.
> -    if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(Vec))
> -      GatherInstructions.push_back(IEI);
> -  }
> -
> -  // Mark the end of the gather sequence.
> -  GatherInstructions.push_back(0);
> -
> -  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
> -    VectorizedValues[VL[i]] = Vec;
> -
> -  return Vec;
> -}
> -
> -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
> -  Value *V = vectorizeTree_rec(VL, VF);
> -
> -  int LastInstrIdx = getLastIndex(VL, VL.size());
> -  for (SetVector<Value *>::iterator it = MustExtract.begin(),
> -                                    e = MustExtract.end();
> -       it != e; ++it) {
> -    Instruction *I = cast<Instruction>(*it);
> -
> -    // This is a scalarized value, so we can use the original value.
> -    // No need to extract from the vector.
> -    if (!LaneMap.count(I))
> -      continue;
> -
> -    Value *Vec = VectorizedValues[I];
> -    // We decided not to vectorize I because one of its users was not
> -    // vectorizerd. This is okay.
> -    if (!Vec)
> -      continue;
> -
> -    Value *Idx = Builder.getInt32(LaneMap[I]);
> -    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
> -    bool Replaced = false;
> -    for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE;
> -         ++U) {
> -      Instruction *UI = cast<Instruction>(*U);
> -      if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
> -        UI->replaceUsesOfWith(I, Extract);
> -      Replaced = true;
> -    }
> -    assert(Replaced && "Must replace at least one outside user");
> -    (void)Replaced;
> -  }
> -
> -  // We moved some instructions around. We have to number them again
> -  // before we can do any analysis.
> -  numberInstructions();
> -  MustScalarize.clear();
> -  MustExtract.clear();
> -  VectorizedValues.clear();
> -  return V;
> -}
> -
> -Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
> -  Type *ScalarTy = VL[0]->getType();
> -  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
> -    ScalarTy = SI->getValueOperand()->getType();
> -  VectorType *VecTy = VectorType::get(ScalarTy, VF);
> -
> -  // Check if all of the operands are constants or identical.
> -  bool AllConst = true;
> -  bool AllSameScalar = true;
> -  for (unsigned i = 0, e = VF; i < e; ++i) {
> -    AllConst &= isa<Constant>(VL[i]);
> -    AllSameScalar &= (VL[0] == VL[i]);
> -    // The instruction must be in the same BB, and it must be
> vectorizable.
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
> -      return Scalarize(VL, VecTy);
> -  }
> -
> -  // Check that this is a simple vector constant.
> -  if (AllConst || AllSameScalar)
> -    return Scalarize(VL, VecTy);
> -
> -  // Scalarize unknown structures.
> -  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
> -  if (!VL0)
> -    return Scalarize(VL, VecTy);
> -
> -  if (VectorizedValues.count(VL0)) {
> -    Value *Vec = VectorizedValues[VL0];
> -    for (int i = 0; i < VF; ++i)
> -      VectorizedValues[VL[i]] = Vec;
> -    return Vec;
> -  }
> -
> -  unsigned Opcode = VL0->getOpcode();
> -  for (unsigned i = 0, e = VF; i < e; ++i) {
> -    Instruction *I = dyn_cast<Instruction>(VL[i]);
> -    // If not all of the instructions are identical then we have to
> scalarize.
> -    if (!I || Opcode != I->getOpcode())
> -      return Scalarize(VL, VecTy);
> -  }
> -
> -  switch (Opcode) {
> -  case Instruction::ExtractElement: {
> -    if (CanReuseExtract(VL, VL.size(), VecTy))
> -      return VL0->getOperand(0);
> -    return Scalarize(VL, VecTy);
> -  }
> -  case Instruction::ZExt:
> -  case Instruction::SExt:
> -  case Instruction::FPToUI:
> -  case Instruction::FPToSI:
> -  case Instruction::FPExt:
> -  case Instruction::PtrToInt:
> -  case Instruction::IntToPtr:
> -  case Instruction::SIToFP:
> -  case Instruction::UIToFP:
> -  case Instruction::Trunc:
> -  case Instruction::FPTrunc:
> -  case Instruction::BitCast: {
> -    ValueList INVL;
> -    for (int i = 0; i < VF; ++i)
> -      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
> -    Value *InVec = vectorizeTree_rec(INVL, VF);
> -    CastInst *CI = dyn_cast<CastInst>(VL0);
> -    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
> -
> -    for (int i = 0; i < VF; ++i)
> -      VectorizedValues[VL[i]] = V;
> -
> -    return V;
> -  }
> -  case Instruction::FCmp:
> -  case Instruction::ICmp: {
> -    // Check that all of the compares have the same predicate.
> -    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
> -    for (unsigned i = 1, e = VF; i < e; ++i) {
> -      CmpInst *Cmp = cast<CmpInst>(VL[i]);
> -      if (Cmp->getPredicate() != P0)
> -        return Scalarize(VL, VecTy);
> -    }
> -
> -    ValueList LHSV, RHSV;
> -    for (int i = 0; i < VF; ++i) {
> -      LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
> -      RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
> -    }
> -
> -    Value *L = vectorizeTree_rec(LHSV, VF);
> -    Value *R = vectorizeTree_rec(RHSV, VF);
> -    Value *V;
> -    if (VL0->getOpcode() == Instruction::FCmp)
> -      V = Builder.CreateFCmp(P0, L, R);
> -    else
> -      V = Builder.CreateICmp(P0, L, R);
> -
> -    for (int i = 0; i < VF; ++i)
> -      VectorizedValues[VL[i]] = V;
> -
> -    return V;
> -  }
> -  case Instruction::Select: {
> -    ValueList TrueVec, FalseVec, CondVec;
> -    for (int i = 0; i < VF; ++i) {
> -      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
> -      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
> -      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
> -    }
> -
> -    Value *True = vectorizeTree_rec(TrueVec, VF);
> -    Value *False = vectorizeTree_rec(FalseVec, VF);
> -    Value *Cond = vectorizeTree_rec(CondVec, VF);
> -    Value *V = Builder.CreateSelect(Cond, True, False);
> -
> -    for (int i = 0; i < VF; ++i)
> -      VectorizedValues[VL[i]] = V;
> -
> -    return V;
> -  }
> -  case Instruction::Add:
> -  case Instruction::FAdd:
> -  case Instruction::Sub:
> -  case Instruction::FSub:
> -  case Instruction::Mul:
> -  case Instruction::FMul:
> -  case Instruction::UDiv:
> -  case Instruction::SDiv:
> -  case Instruction::FDiv:
> -  case Instruction::URem:
> -  case Instruction::SRem:
> -  case Instruction::FRem:
> -  case Instruction::Shl:
> -  case Instruction::LShr:
> -  case Instruction::AShr:
> -  case Instruction::And:
> -  case Instruction::Or:
> -  case Instruction::Xor: {
> -    ValueList LHSVL, RHSVL;
> -    for (int i = 0; i < VF; ++i) {
> -      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
> -      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
> -    }
> -
> -    Value *LHS = vectorizeTree_rec(LHSVL, VF);
> -    Value *RHS = vectorizeTree_rec(RHSVL, VF);
> -    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
> -    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
> -
> -    for (int i = 0; i < VF; ++i)
> -      VectorizedValues[VL[i]] = V;
> -
> -    return V;
> -  }
> -  case Instruction::Load: {
> -    LoadInst *LI = cast<LoadInst>(VL0);
> -    unsigned Alignment = LI->getAlignment();
> -
> -    // Check if all of the loads are consecutive.
> -    for (unsigned i = 1, e = VF; i < e; ++i)
> -      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
> -        return Scalarize(VL, VecTy);
> -
> -    // Loads are inserted at the head of the tree because we don't want
> to sink
> -    // them all the way down past store instructions.
> -    Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size()));
> -    IRBuilder<> LoadBuilder(Loc);
> -    Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(),
> -                                              VecTy->getPointerTo());
> -    LI = LoadBuilder.CreateLoad(VecPtr);
> -    LI->setAlignment(Alignment);
> -
> -    for (int i = 0; i < VF; ++i)
> -      VectorizedValues[VL[i]] = LI;
> -
> -    return LI;
> -  }
> -  case Instruction::Store: {
> -    StoreInst *SI = cast<StoreInst>(VL0);
> -    unsigned Alignment = SI->getAlignment();
> -
> -    ValueList ValueOp;
> -    for (int i = 0; i < VF; ++i)
> -      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
> -
> -    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
> -    Value *VecPtr =
> -        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
> -    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
> -
> -    for (int i = 0; i < VF; ++i)
> -      cast<Instruction>(VL[i])->eraseFromParent();
> -    return 0;
> -  }
> -  default:
> -    return Scalarize(VL, VecTy);
> -  }
> -}
> -
> -} // end of namespace
>
> Removed: llvm/trunk/lib/Transforms/Vectorize/VecUtils.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VecUtils.h?rev=184646&view=auto
>
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/VecUtils.h (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/VecUtils.h (removed)
> @@ -1,194 +0,0 @@
> -//===- VecUtils.h - Vectorization Utilities -------------------------------===//
> -//
> -//                     The LLVM Compiler Infrastructure
> -//
> -// This file is distributed under the University of Illinois Open Source
> -// License. See LICENSE.TXT for details.
> -//
>
> -//===----------------------------------------------------------------------===//
> -//
> -// This family of classes and functions manipulate vectors and chains of
> -// vectors.
> -//
>
> -//===----------------------------------------------------------------------===//
> -
> -#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
> -#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
> -
> -#include "llvm/ADT/DenseMap.h"
> -#include "llvm/ADT/SetVector.h"
> -#include "llvm/ADT/SmallPtrSet.h"
> -#include "llvm/ADT/SmallVector.h"
> -#include "llvm/Analysis/AliasAnalysis.h"
> -#include "llvm/IR/IRBuilder.h"
> -#include <vector>
> -
> -namespace llvm {
> -
> -class BasicBlock;
> -class Instruction;
> -class Type;
> -class VectorType;
> -class StoreInst;
> -class Value;
> -class ScalarEvolution;
> -class DataLayout;
> -class TargetTransformInfo;
> -class AliasAnalysis;
> -class Loop;
> -
> -/// Bottom Up SLP vectorization utility class.
> -struct BoUpSLP {
> -  typedef SmallVector<Value *, 8> ValueList;
> -  typedef SmallVector<Instruction *, 16> InstrList;
> -  typedef SmallPtrSet<Value *, 16> ValueSet;
> -  typedef SmallVector<StoreInst *, 8> StoreList;
> -  static const int max_cost = 1 << 20;
> -
> -  // \brief C'tor.
> -  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
> -          TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
> -
> -  /// \brief Take the pointer operand from the Load/Store instruction.
> -  /// \returns NULL if this is not a valid Load/Store instruction.
> -  static Value *getPointerOperand(Value *I);
> -
> -  /// \brief Take the address space operand from the Load/Store instruction.
> -  /// \returns -1 if this is not a valid Load/Store instruction.
> -  static unsigned getAddressSpaceOperand(Value *I);
> -
> -  /// \returns true if the memory operations A and B are consecutive.
> -  bool isConsecutiveAccess(Value *A, Value *B);
> -
> -  /// \brief Vectorize the tree that starts with the elements in \p VL.
> -  /// \returns the vectorized value.
> -  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
> -
> -  /// \returns the vectorization cost of the subtree that starts at \p VL.
> -  /// A negative number means that this is profitable.
> -  int getTreeCost(ArrayRef<Value *> VL);
> -
> -  /// \returns the scalarization cost for this list of values. Assuming that
> -  /// this subtree gets vectorized, we may need to extract the values from the
> -  /// roots. This method calculates the cost of extracting the values.
> -  int getScalarizationCost(ArrayRef<Value *> VL);
> -
> -  /// \brief Attempts to order and vectorize a sequence of stores. This
> -  /// function does a quadratic scan of the given stores.
> -  /// \returns true if the basic block was modified.
> -  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
> -
> -  /// \brief Vectorize a group of scalars into a vector tree.
> -  /// \returns the vectorized value.
> -  Value *vectorizeArith(ArrayRef<Value *> Operands);
> -
> -  /// \returns the list of new instructions that were added in order to collect
> -  /// scalars into vectors. This list can be used to further optimize the gather
> -  /// sequences.
> -  InstrList &getGatherSeqInstructions() { return GatherInstructions; }
> -
> -private:
> -  /// \brief This method contains the recursive part of getTreeCost.
> -  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
> -
> -  /// \brief This recursive method looks for vectorization hazards such as
> -  /// values that are used by multiple users and checks that values are used
> -  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
> -  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
> -
> -  /// \brief This method contains the recursive part of vectorizeTree.
> -  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
> -
> -  /// \brief Number all of the instructions in the block.
> -  void numberInstructions();
> -
> -  ///  \brief Vectorize a sorted sequence of stores.
> -  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
> -
> -  /// \returns the scalarization cost for this type. Scalarization in this
> -  /// context means the creation of vectors from a group of scalars.
> -  int getScalarizationCost(Type *Ty);
> -
> -  /// \returns the AA location that is being accessed by the instruction.
> -  AliasAnalysis::Location getLocation(Instruction *I);
> -
> -  /// \brief Checks if it is possible to sink an instruction from
> -  /// \p Src to \p Dst.
> -  /// \returns the pointer to the barrier instruction if we can't sink.
> -  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
> -
> -  /// \returns the index of the last instruction in the BB from \p VL.
> -  /// Only consider the first \p VF elements.
> -  int getLastIndex(ArrayRef<Value *> VL, unsigned VF);
> -
> -  /// \returns the index of the first User of \p VL.
> -  /// Only consider the first \p VF elements.
> -  int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF);
> -
> -  /// \returns the instruction \p I or \p J that appears last in the BB .
> -  int getLastIndex(Instruction *I, Instruction *J);
> -
> -  /// \returns the insertion point for \p Index.
> -  Instruction *getInsertionPoint(unsigned Index);
> -
> -  /// \returns a vector from a collection of scalars in \p VL.
> -  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
> -
> -private:
> -  /// Maps instructions to numbers and back.
> -  SmallDenseMap<Value *, int> InstrIdx;
> -  /// Maps integers to Instructions.
> -  std::vector<Instruction *> InstrVec;
> -
> -  // -- containers that are used during getTreeCost -- //
> -
> -  /// Contains values that must be scalarized because they are used
> -  /// by multiple lanes, or by users outside the tree.
> -  /// NOTICE: The vectorization methods also use this set.
> -  ValueSet MustScalarize;
> -
> -  /// Contains values that have users outside of the vectorized graph.
> -  /// We need to generate extract instructions for these values.
> -  /// NOTICE: The vectorization methods also use this set.
> -  SetVector<Value *> MustExtract;
> -
> -  /// Contains a list of values that are used outside the current tree. This
> -  /// set must be reset between runs.
> -  SetVector<Value *> MultiUserVals;
> -  /// Maps values in the tree to the vector lanes that uses them. This map must
> -  /// be reset between runs of getCost.
> -  std::map<Value *, int> LaneMap;
> -  /// A list of instructions to ignore while sinking
> -  /// memory instructions. This map must be reset between runs of getCost.
> -  ValueSet MemBarrierIgnoreList;
> -
> -  // -- Containers that are used during vectorizeTree -- //
> -
> -  /// Maps between the first scalar to the vector. This map must be reset
> -  /// between runs.
> -  DenseMap<Value *, Value *> VectorizedValues;
> -
> -  // -- Containers that are used after vectorization by the caller -- //
> -
> -  /// A list of instructions that are used when gathering scalars into vectors.
> -  /// In many cases these instructions can be hoisted outside of the BB.
> -  /// Iterating over this list is faster than calling LICM.
> -  /// Notice: We insert NULL ptrs to separate between the different gather
> -  /// sequences.
> -  InstrList GatherInstructions;
> -
> -  /// Instruction builder to construct the vectorized tree.
> -  IRBuilder<> Builder;
> -
> -  // Analysis and block reference.
> -  BasicBlock *BB;
> -  ScalarEvolution *SE;
> -  DataLayout *DL;
> -  TargetTransformInfo *TTI;
> -  AliasAnalysis *AA;
> -  Loop *L;
> -};
> -
> -} // end of namespace
> -
> -#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll?rev=184647&r1=184646&r2=184647&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll Sat Jun 22 16:34:10 2013
> @@ -50,9 +50,9 @@ entry:
>  ; }
>
>  ; CHECK: @extr_user
> +; CHECK: load i32*
>  ; CHECK: store <4 x i32>
> -; CHECK-NEXT: extractelement <4 x i32>
> -; CHECK: ret
> +; CHECK-NEXT: ret
>  define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
>  entry:
>    %0 = load i32* %A, align 4
> @@ -79,9 +79,9 @@ entry:
>
>  ; In this example we have an external user that is not the first element in the vector.
>  ; CHECK: @extr_user1
> +; CHECK: load i32*
>  ; CHECK: store <4 x i32>
> -; CHECK-NEXT: extractelement <4 x i32>
> -; CHECK: ret
> +; CHECK-NEXT: ret
>  define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
>  entry:
>    %0 = load i32* %A, align 4
>
> Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll?rev=184647&view=auto
>
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll (added)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll Sat Jun 22 16:34:10 2013
> @@ -0,0 +1,55 @@
> +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
> +
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +target triple = "x86_64-apple-macosx10.7.0"
> +
> +; int bar(double *A, int d) {
> +;   double A0 = A[0];
> +;   double A1 = A[1];
> +;   float F0 = A0;
> +;   float F1 = A1;
> +;   if (d) foo(); <----- This splits the blocks
> +;   F0+=4.0;
> +;   F1+=5.0;
> +;   A[8] = 9.0 + F0;
> +;   A[9] = 5.0 + F1;
> +; }
> +
> +
> +;CHECK: @bar
> +;CHECK: load <2 x double>
> +;CHECK: fptrunc <2 x double>
> +;CHECK: call i32
> +;CHECK: fadd <2 x float>
> +;CHECK: fpext <2 x float>
> +;CHECK: store <2 x double>
> +;CHECK: ret
> +define i32 @bar(double* nocapture %A, i32 %d) {
> +  %1 = load double* %A, align 8
> +  %2 = getelementptr inbounds double* %A, i64 1
> +  %3 = load double* %2, align 8
> +  %4 = fptrunc double %1 to float
> +  %5 = fptrunc double %3 to float
> +  %6 = icmp eq i32 %d, 0
> +  br i1 %6, label %9, label %7
> +
> +; <label>:7                                       ; preds = %0
> +  %8 = tail call i32 (...)* @foo()
> +  br label %9
> +
> +; <label>:9                                       ; preds = %0, %7
> +  %10 = fadd float %4, 4.000000e+00
> +  %11 = fadd float %5, 5.000000e+00
> +  %12 = fpext float %10 to double
> +  %13 = fadd double %12, 9.000000e+00
> +  %14 = getelementptr inbounds double* %A, i64 8
> +  store double %13, double* %14, align 8
> +  %15 = fpext float %11 to double
> +  %16 = fadd double %15, 5.000000e+00
> +  %17 = getelementptr inbounds double* %A, i64 9
> +  store double %16, double* %17, align 8
> +  ret i32 undef
> +}
> +
> +declare i32 @foo(...)
> +
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll?rev=184647&r1=184646&r2=184647&view=diff
>
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll Sat Jun 22 16:34:10 2013
> @@ -12,8 +12,8 @@ target triple = "x86_64-apple-macosx10.7
>  ;}
>
>  ;CHECK: @foo
> -;CHECK: load <4 x i32>
>  ;CHECK: insertelement <4 x i32>
> +;CHECK: load <4 x i32>
>  ;CHECK: add <4 x i32>
>  ;CHECK: store <4 x i32>
>  ;CHECK: ret
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>

-- 
Alexey Samsonov, MSK
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130623/6dec61fa/attachment.html>