[llvm] r184647 - SLP Vectorizer: Implement multi-block slp-vectorization.

Nadav Rotem nrotem at apple.com
Sat Jun 22 14:34:10 PDT 2013


Author: nadav
Date: Sat Jun 22 16:34:10 2013
New Revision: 184647

URL: http://llvm.org/viewvc/llvm-project?rev=184647&view=rev
Log:
SLP Vectorizer: Implement multi-block slp-vectorization.

Rewrote the SLP-vectorization as a whole-function vectorization pass. It is now able to vectorize chains across multiple basic blocks.
It still does not vectorize PHIs, but this should be easy to do now that we scan the entire function.
I removed the support for extracting values from trees.
We are now able to vectorize more programs, but there are some serious regressions in many workloads (such as flops-6 and mandel-2).


Added:
    llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll
Removed:
    llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp
    llvm/trunk/lib/Transforms/Vectorize/VecUtils.h
Modified:
    llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
    llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll
    llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt?rev=184647&r1=184646&r2=184647&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt (original)
+++ llvm/trunk/lib/Transforms/Vectorize/CMakeLists.txt Sat Jun 22 16:34:10 2013
@@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize
   Vectorize.cpp
   LoopVectorize.cpp
   SLPVectorizer.cpp
-  VecUtils.cpp
   )
 
 add_dependencies(LLVMVectorize intrinsics_gen)

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=184647&r1=184646&r2=184647&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Sat Jun 22 16:34:10 2013
@@ -18,17 +18,20 @@
 #define SV_NAME "slp-vectorizer"
 #define DEBUG_TYPE "SLP"
 
-#include "VecUtils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -36,6 +39,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <map>
 
 using namespace llvm;
@@ -46,9 +50,1138 @@ static cl::opt<int>
                               "number. (gain = -cost of vectorization)"));
 namespace {
 
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 6;
+
+/// RAII guard: saves the builder's insert point and restores it on destruction.
+class BuilderLocGuard {
+public:
+  BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {}
+  ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); }
+
+private:
+  // Prevent copying: the copy operations are private.
+  BuilderLocGuard(const BuilderLocGuard &);
+  BuilderLocGuard &operator=(const BuilderLocGuard &);
+  IRBuilder<> &Builder; // The builder whose insertion point is restored.
+  BasicBlock::iterator Loc; // Insertion point captured at construction.
+};
+
+/// A helper class for numbering instructions in multiple blocks.
+/// Numbers start at zero for each basic block.
+struct BlockNumbering {
+
+  BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
+
+  BlockNumbering() : BB(0), Valid(false) {}
+
+  void numberInstructions() { // Rebuilds both tables from scratch.
+    unsigned Loc = 0;
+    InstrIdx.clear();
+    InstrVec.clear();
+    // Number the instructions in the block, in block order.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      InstrIdx[it] = Loc++;
+      InstrVec.push_back(it);
+      assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+    }
+    Valid = true;
+  }
+
+  int getIndex(Instruction *I) { // Numbers the block lazily if needed.
+    if (!Valid)
+      numberInstructions();
+    assert(InstrIdx.count(I) && "Unknown instruction");
+    return InstrIdx[I];
+  }
+
+  Instruction *getInstruction(unsigned loc) { // Inverse of getIndex.
+    if (!Valid)
+      numberInstructions();
+    assert(InstrVec.size() > loc && "Invalid Index");
+    return InstrVec[loc];
+  }
+
+  void forget() { Valid = false; } // The next query renumbers the block.
+
+private:
+  /// The block we are numbering.
+  BasicBlock *BB;
+  /// Is the block numbered.
+  bool Valid;
+  /// Maps instructions to their index within the block.
+  SmallDenseMap<Instruction *, int> InstrIdx;
+  /// Maps integers to Instructions.
+  std::vector<Instruction *> InstrVec;
+};
+
+class FuncSLP {
+  typedef SmallVector<Value *, 8> ValueList;
+  typedef SmallVector<Instruction *, 16> InstrList;
+  typedef SmallPtrSet<Value *, 16> ValueSet;
+  typedef SmallVector<StoreInst *, 8> StoreList;
+
+public:
+  static const int MAX_COST = INT_MIN; // Sentinel cost: tree was rejected.
+
+  FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
+          TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li)
+      : F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li),
+        Builder(Se->getContext()) {
+    for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+      BasicBlock *BB = it;
+      BlocksNumbers[BB] = BlockNumbering(BB); // Not numbered until queried.
+    }
+  }
+
+  /// \brief Take the pointer operand from the Load/Store instruction.
+  /// \returns NULL if this is not a valid Load/Store instruction.
+  static Value *getPointerOperand(Value *I);
+
+  /// \brief Take the address space operand from the Load/Store instruction.
+  /// \returns -1 if this is not a valid Load/Store instruction.
+  static unsigned getAddressSpaceOperand(Value *I);
+
+  /// \returns true if memory operation B accesses the address right after A.
+  bool isConsecutiveAccess(Value *A, Value *B);
+
+  /// \brief Vectorize the tree that starts with the elements in \p VL.
+  /// \returns the vectorized value.
+  Value *vectorizeTree(ArrayRef<Value *> VL);
+
+  /// \returns the vectorization cost of the subtree that starts at \p VL.
+  /// A negative number means that this is profitable.
+  int getTreeCost(ArrayRef<Value *> VL);
+
+  /// \returns the gather cost for this list of values: the cost of building
+  /// a vector out of these scalars, computed as the sum of the per-element
+  /// insertion costs for a vector of VL.size() elements.
+  int getGatherCost(ArrayRef<Value *> VL);
+
+  /// \brief Attempts to order and vectorize a sequence of stores. This
+  /// function does a quadratic scan of the given stores.
+  /// \returns true if any store chain was vectorized.
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
+
+  /// \brief Vectorize a group of scalars into a vector tree.
+  /// \returns the vectorized value.
+  Value *vectorizeArith(ArrayRef<Value *> Operands);
+
+  /// \brief This method contains the recursive part of getTreeCost.
+  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This recursive method looks for vectorization hazards such as
+  /// values that are used by multiple users and checks that values are used
+  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
+  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This method contains the recursive part of vectorizeTree.
+  Value *vectorizeTree_rec(ArrayRef<Value *> VL);
+
+  ///  \brief Vectorize a sorted sequence of stores.
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+
+  /// \returns the scalarization cost for this type. Scalarization in this
+  /// context means the creation of vectors from a group of scalars.
+  int getGatherCost(Type *Ty);
+
+  /// \returns the AA location that is being accessed by the instruction.
+  AliasAnalysis::Location getLocation(Instruction *I);
+
+  /// \brief Checks if it is possible to sink an instruction from
+  /// \p Src to \p Dst.
+  /// \returns the pointer to the barrier instruction if we can't sink.
+  Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
+
+  /// \returns the index of the last instruction in the BB from \p VL.
+  int getLastIndex(ArrayRef<Value *> VL);
+
+  /// \returns the last Instruction in the bundle \p VL.
+  Instruction *getLastInstruction(ArrayRef<Value *> VL);
+
+  /// \returns the Instruction at index \p Index which is in Block \p BB.
+  Instruction *getInstructionForIndex(unsigned Index, BasicBlock *BB);
+
+  /// \returns the index of the first user of \p VL inside VL's own block.
+  int getFirstUserIndex(ArrayRef<Value *> VL);
+
+  /// \returns a vector from a collection of scalars in \p VL.
+  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+
+  /// \brief Try to hoist gather sequences outside of the loop in cases where
+  /// all of the sources are loop invariant.
+  void hoistGatherSequence();
+
+  bool needToGatherAny(ArrayRef<Value *> VL) { // True if any value must gather.
+    for (int i = 0, e = VL.size(); i < e; ++i)
+      if (MustGather.count(VL[i]))
+        return true;
+    return false;
+  }
+
+  /// -- Vectorization State --
+
+  /// Maps values in the tree to the vector lane that uses them. This map must
+  /// be reset between runs of getTreeCost.
+  std::map<Value *, int> LaneMap;
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This set must be reset between runs of getTreeCost.
+  ValueSet MemBarrierIgnoreList;
+
+  /// Maps between the first scalar to the vector. This map must be reset
+  /// between runs.
+  DenseMap<Value *, Value *> VectorizedValues;
+
+  /// Contains values that must be gathered because they are used
+  /// by multiple lanes, or by users outside the tree.
+  /// NOTICE: The vectorization methods also use this set.
+  ValueSet MustGather;
+
+  /// Contains the in-tree values that have more than one use and so may be
+  /// used outside the current tree. This set must be reset between runs.
+  SetVector<Value *> MultiUserVals;
+
+  /// Holds all of the instructions that we gathered.
+  SetVector<Instruction *> GatherSeq;
+
+  /// Numbers instructions in different blocks.
+  std::map<BasicBlock *, BlockNumbering> BlocksNumbers;
+
+  // Analysis and block reference.
+  Function *F;
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+  LoopInfo *LI;
+  /// Instruction builder to construct the vectorized tree.
+  IRBuilder<> Builder;
+};
+
+int FuncSLP::getGatherCost(Type *Ty) { // Ty must be a VectorType.
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost; // Sum of the InsertElement cost of every lane.
+}
+
+int FuncSLP::getGatherCost(ArrayRef<Value *> VL) {
+  // Find the type of the operands in VL. For stores, use the stored value.
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+  // The gather cost is the cost of inserting every scalar into the vector.
+  return getGatherCost(VecTy);
+}
+
+AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) // NOTE: shadows the member LI.
+    return AA->getLocation(LI);
+  return AliasAnalysis::Location(); // Neither load nor store: default Location.
+}
+
+Value *FuncSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  return 0; // Neither a load nor a store.
+}
+
+unsigned FuncSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    return L->getPointerAddressSpace();
+  if (StoreInst *S = dyn_cast<StoreInst>(I))
+    return S->getPointerAddressSpace();
+  return -1; // Not a load/store; converts to the largest unsigned value.
+}
+
+bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB))
+    return false;
+
+  // Check that the pointers of A and B have the same type.
+  if (PtrA->getType() != PtrB->getType())
+    return false;
+
+  // Calculate the distance PtrA - PtrB as a SCEV expression.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non constant distance.
+  if (!ConstOffSCEV)
+    return false;
+
+  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The instructions are consecutive if B's address minus A's address equals
+  // the store size of the pointee type.
+  int64_t Sz = DL->getTypeStoreSize(Ty);
+  return ((-Offset) == Sz);
+}
+
+Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  /// Scan all of the instructions strictly between SRC and DST and check if
+  /// any of them may alias with the source.
+  for (++I; I != E; ++I) {
+    // Ignore instructions that are in the 'ignore' list.
+    if (MemBarrierIgnoreList.count(I))
+      continue;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory()) // A write conflicts with reads and writes.
+        continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory()) // A read only conflicts with writes.
+        continue;
+    }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B)) // Unknown location is a barrier.
+      return I;
+  }
+  return 0; // Nothing between Src and Dst blocks the move.
+}
+
+static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+  BasicBlock *BB = 0;
+  for (int i = 0, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I)
+      return 0; // A non-instruction value has no parent block.
+
+    if (!BB) {
+      BB = I->getParent(); // The first instruction defines the block.
+      continue;
+    }
+
+    if (BB != I->getParent())
+      return 0; // The instructions live in different blocks.
+  }
+  return BB; // The common block (0 when VL is empty).
+}
+
+static bool allConstant(ArrayRef<Value *> VL) {
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    if (!isa<Constant>(VL[i]))
+      return false; // Found a non-constant value.
+  return true; // Every value is a Constant (also true for an empty VL).
+}
+
+static bool isSplat(ArrayRef<Value *> VL) {
+  for (unsigned i = 1, e = VL.size(); i < e; ++i)
+    if (VL[i] != VL[0]) // Compared by identity: must be the same Value.
+      return false;
+  return true; // All elements are the same Value (or VL has fewer than two).
+}
+
+static unsigned getSameOpcode(ArrayRef<Value *> VL) {
+  unsigned Opcode = 0;
+  for (int i = 0, e = VL.size(); i < e; i++) {
+    if (Instruction *I = dyn_cast<Instruction>(VL[i])) { // Others are skipped.
+      if (!Opcode) {
+        Opcode = I->getOpcode(); // The first instruction sets the opcode.
+        continue;
+      }
+      if (Opcode != I->getOpcode())
+        return 0; // Mixed opcodes.
+    }
+  }
+  return Opcode; // Still 0 if VL holds no instructions.
+}
+
+static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
+                            VectorType *VecTy) {
+  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+  // Check if all of the extracts come from the same vector and from the
+  // correct offset, i.e. lane i extracts element i.
+  Value *VL0 = VL[0];
+  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+  Value *Vec = E0->getOperand(0);
+
+  // We have to extract from the same vector type.
+  if (Vec->getType() != VecTy)
+    return false;
+
+  // The first extract must use the constant index zero.
+  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
+  if (!CI || CI->getZExtValue())
+    return false;
+
+  for (unsigned i = 1, e = VF; i < e; ++i) { // Reads VL[1] .. VL[VF - 1].
+    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+
+    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+      return false;
+  }
+
+  return true;
+}
+
+void FuncSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  if (Depth == RecursionMaxDepth) // Too deep: gather instead of recursing.
+    return MustGather.insert(VL.begin(), VL.end());
+
+  // Don't handle vectors.
+  if (VL[0]->getType()->isVectorTy())
+    return;
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    if (SI->getValueOperand()->getType()->isVectorTy())
+      return;
+
+  // Constants, splats and values from different blocks are simply gathered.
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL))
+    return MustGather.insert(VL.begin(), VL.end());
+
+  // getSameBlock succeeded, so every value in VL is an instruction.
+  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
+  assert(VL0 && "Invalid instruction");
+
+  // Mark instructions with multiple users.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    // Remember to check if all of the users of this instruction are vectorized
+    // within our tree. At depth zero we have no local users, only external
+    // users that we don't care about.
+    if (Depth && I && I->getNumUses() > 1) {
+      DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
+                      "because it has multiple users:" << *I << " \n");
+      MultiUserVals.insert(I);
+    }
+  }
+
+  // Check that the instruction is only used within one lane.
+  for (int i = 0, e = VL.size(); i < e; ++i) {
+    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) {
+      DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n");
+      return MustGather.insert(VL.begin(), VL.end());
+    }
+    // Mark this instruction as 'seen' and remember the lane.
+    LaneMap[VL[i]] = i;
+  }
+
+  unsigned Opcode = getSameOpcode(VL);
+  if (!Opcode) // The bundle mixes different opcodes.
+    return MustGather.insert(VL.begin(), VL.end());
+
+  switch (Opcode) {
+  case Instruction::ExtractElement: {
+    VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
+    // No need to follow ExtractElements that are going to be optimized away.
+    if (CanReuseExtract(VL, VL.size(), VecTy))
+      return;
+    // Fall through. (The Load case below returns as well.)
+  }
+  case Instruction::Load:
+    return; // Stop here: operands of loads and extracts are not scanned.
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast:
+  case Instruction::Select:
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector: operand i of every lane.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      getTreeUses_rec(Operands, Depth + 1);
+    }
+    return;
+  }
+  case Instruction::Store: {
+    ValueList Operands; // Only operand 0 of each store is followed.
+    for (unsigned j = 0; j < VL.size(); ++j)
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+    getTreeUses_rec(Operands, Depth + 1);
+    return;
+  }
+  default: // Any other opcode is gathered.
+    return MustGather.insert(VL.begin(), VL.end());
+  }
+}
+
+int FuncSLP::getLastIndex(ArrayRef<Value *> VL) {
+  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+  BlockNumbering &BN = BlocksNumbers[BB];
+
+  int MaxIdx = BN.getIndex(BB->getFirstNonPHI()); // Never below this index.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+  return MaxIdx; // Largest in-block index among VL and the first non-PHI.
+}
+
+Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) {
+  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+  BlockNumbering &BN = BlocksNumbers[BB];
+
+  int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
+  for (unsigned i = 1, e = VL.size(); i < e; ++i)
+    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+  return BN.getInstruction(MaxIdx); // The member of VL with the largest index.
+}
+
+Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) {
+  BlockNumbering &BN = BlocksNumbers[BB]; // NOTE: inserts an entry if missing.
+  return BN.getInstruction(Index); // Asserts that Index is within the block.
+}
+
+int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
+  BasicBlock *BB = getSameBlock(VL); // Callers must pass a same-block bundle.
+  BlockNumbering &BN = BlocksNumbers[BB];
+
+  // Find the first user of the values, starting from the block terminator.
+  int FirstUser = BN.getIndex(BB->getTerminator());
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
+         U != UE; ++U) {
+      Instruction *Instr = dyn_cast<Instruction>(*U);
+
+      if (!Instr || Instr->getParent() != BB)
+        continue; // Only instruction users inside BB are considered.
+
+      FirstUser = std::min(FirstUser, BN.getIndex(Instr));
+    }
+  }
+  return FirstUser; // The terminator's index if there is no user in BB.
+}
+
+int FuncSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  Type *ScalarTy = VL[0]->getType();
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+
+  /// Don't mess with vectors.
+  if (ScalarTy->isVectorTy())
+    return FuncSLP::MAX_COST;
+
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  if (allConstant(VL)) // A constant vector is treated as free.
+    return 0;
+
+  if (isSplat(VL))
+    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+
+  if (Depth == RecursionMaxDepth || needToGatherAny(VL))
+    return getGatherCost(VecTy);
+
+  BasicBlock *BB = getSameBlock(VL);
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && BB && "Invalid Instruction Value");
+
+  // Check if it is safe to sink the loads or the stores down to the last one.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    int MaxIdx = getLastIndex(VL);
+    Instruction *Last = getInstructionForIndex(MaxIdx, BB);
+
+    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+      if (VL[i] == Last)
+        continue;
+      Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
+                     << "\n because of " << *Barrier << "\n");
+        return MAX_COST;
+      }
+    }
+  }
+
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  switch (Opcode) {
+  case Instruction::ExtractElement: {
+    if (CanReuseExtract(VL, VL.size(), VecTy)) // The source vector is reused.
+      return 0;
+    return getGatherCost(VecTy);
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    ValueList Operands;
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    // Prepare the operand vector.
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      // Check that the source type of the cast is the same for all lanes.
+      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
+        return getGatherCost(VecTy);
+    }
+
+    int Cost = getTreeCost_rec(Operands, Depth + 1);
+    if (Cost == FuncSLP::MAX_COST)
+      return Cost;
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+                                                       VL0->getType(), SrcTy);
+
+    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+    Cost += (VecCost - ScalarCost);
+    return Cost;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp: {
+    // Check that all of the compares have the same predicate.
+    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+      CmpInst *Cmp = cast<CmpInst>(VL[i]);
+      if (Cmp->getPredicate() != P0)
+        return getGatherCost(VecTy);
+    }
+    // Fall through.
+  }
+  case Instruction::Select:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    int TotalCost = 0;
+    // Accumulate the cost of every operand bundle.
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      // Prepare the operand vector.
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      int Cost = getTreeCost_rec(Operands, Depth + 1);
+      if (Cost == MAX_COST)
+        return MAX_COST;
+      TotalCost += Cost;
+    }
+
+    // Calculate the cost of this instruction.
+    int ScalarCost = 0;
+    int VecCost = 0;
+    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
+        Opcode == Instruction::Select) {
+      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+      ScalarCost =
+          VecTy->getNumElements() *
+          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+    } else {
+      ScalarCost = VecTy->getNumElements() *
+                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+    }
+    TotalCost += (VecCost - ScalarCost);
+    return TotalCost;
+  }
+  case Instruction::Load: {
+    // If we have to scalarize the loads, add the cost of forming the vector.
+    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
+        return getGatherCost(VecTy);
+
+    // Cost of one wide (VecTy) load - cost of the scalar loads it replaces.
+    int ScalarLdCost = VecTy->getNumElements() *
+                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+    return VecLdCost - ScalarLdCost;
+  }
+  case Instruction::Store: {
+    // We know that we can merge the stores. Cost one wide (VecTy) store.
+    int ScalarStCost = VecTy->getNumElements() *
+                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+    int StoreCost = VecStCost - ScalarStCost;
+
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j) {
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+      MemBarrierIgnoreList.insert(VL[j]); // These stores are not barriers.
+    }
+
+    int Cost = getTreeCost_rec(Operands, Depth + 1);
+    if (Cost == MAX_COST)
+      return MAX_COST;
+
+    int TotalCost = StoreCost + Cost;
+    return TotalCost;
+  }
+  default:
+    // Unable to vectorize unknown instructions.
+    return getGatherCost(VecTy);
+  }
+}
+
+int FuncSLP::getTreeCost(ArrayRef<Value *> VL) {
+  // Reset the per-tree state: the list of stores to ignore as barriers, the
+  // lane map, the multi-user values and the values that must be gathered.
+  MemBarrierIgnoreList.clear();
+  LaneMap.clear();
+  MultiUserVals.clear();
+  MustGather.clear();
+
+  if (!getSameBlock(VL)) // The roots must be instructions in a single block.
+    return MAX_COST;
+
+  // Find the location of the last root.
+  int LastRootIndex = getLastIndex(VL);
+  int FirstUserIndex = getFirstUserIndex(VL);
+
+  // Don't vectorize if there are users of the tree roots inside the tree
+  // itself.
+  if (LastRootIndex > FirstUserIndex)
+    return MAX_COST;
+
+  // Scan the tree and find which value is used by which lane, and which values
+  // must be scalarized.
+  getTreeUses_rec(VL, 0);
+
+  // Check that instructions with multiple users can be vectorized. Mark unsafe
+  // instructions.
+  for (SetVector<Value *>::iterator it = MultiUserVals.begin(),
+                                    e = MultiUserVals.end();
+       it != e; ++it) {
+    // Check that all of the users of this instr are within the tree.
+    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
+         I != E; ++I) {
+      if (LaneMap.find(*I) == LaneMap.end()) {
+        DEBUG(dbgs() << "SLP: Adding to MustGather "
+                        "because of an out of tree usage.\n");
+        MustGather.insert(*it);
+        break; // One out-of-tree user is enough; stop scanning this value.
+      }
+    }
+  }
+
+  // Now calculate the cost of vectorizing the tree.
+  return getTreeCost_rec(VL, 0);
+}
+bool FuncSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
+  unsigned ChainLen = Chain.size();
+  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+               << "\n");
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  // isPowerOf2_32 rejects 0, so a zero-sized type never reaches the division.
+  unsigned VF = isPowerOf2_32(Sz) ? MinVecRegSize / Sz : 0;
+  if (VF < 2)
+    return false;
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+    if (i + VF > e) // Fewer than VF stores remain in the chain.
+      break;
+    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+                 << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    int Cost = getTreeCost(Operands);
+    if (Cost == FuncSLP::MAX_COST)
+      continue;
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      vectorizeTree(Operands);
+      i += VF - 1; // Together with ++i this skips the VF stores just handled.
+      Changed = true;
+    }
+  }
+
+  if (Changed || ChainLen > VF)
+    return Changed;
+
+  // Handle short chains. This helps us catch types such as <3 x float> that
+  // are smaller than vector size.
+  int Cost = getTreeCost(Chain);
+  if (Cost == FuncSLP::MAX_COST)
+    return false;
+  if (Cost < CostThreshold) {
+    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
+                 << " for size = " << ChainLen << "\n");
+    vectorizeTree(Chain);
+    return true;
+  }
+
+  return false;
+}
+
+bool FuncSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
+  SetVector<Value *> Heads, Tails;
+  SmallDenseMap<Value *, Value *> ConsecutiveChain; // Store -> next store.
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of stores that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j)
+        continue;
+
+      if (isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+       it != e; ++it) {
+    if (Tails.count(*it)) // This store continues a chain; it is not a start.
+      continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I))
+        break;
+      Operands.push_back(I);
+      // Move to the next value in the chain (null after the last store).
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+/// Build a vector of type \p Ty from the scalars in \p VL with insertelement;
+/// created instructions go into GatherSeq and the result is cached for VL[0].
+Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+  Value *V = UndefValue::get(Ty);
+  for (unsigned Idx = 0, E = Ty->getNumElements(); Idx != E; ++Idx) {
+    V = Builder.CreateInsertElement(V, VL[Idx], Builder.getInt32(Idx));
+    if (Instruction *Inserted = dyn_cast<Instruction>(V))
+      GatherSeq.insert(Inserted);
+  }
+  VectorizedValues[VL[0]] = V;
+  return V;
+}
+
+/// Recursively emit vector code for the bundle of scalars \p VL, gathering
+/// the scalars with insertelement when the bundle cannot be vectorized.
+/// \returns the vector, or null for a store bundle (the stores are erased).
+Value *FuncSLP::vectorizeTree_rec(ArrayRef<Value *> VL) {
+  BuilderLocGuard Guard(Builder);
+
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  if (needToGatherAny(VL))
+    return Gather(VL, VecTy);
+
+  if (VectorizedValues.count(VL[0])) {
+    DEBUG(dbgs() << "SLP: Diamond merged at depth.\n");
+    return VectorizedValues[VL[0]];
+  }
+
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  unsigned Opcode = VL0->getOpcode();
+  assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+
+  switch (Opcode) {
+  case Instruction::ExtractElement:
+    if (CanReuseExtract(VL, VL.size(), VecTy))
+      return VL0->getOperand(0);
+    return Gather(VL, VecTy);
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    ValueList INVL;
+    for (int i = 0, e = VL.size(); i < e; ++i)
+      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+
+    Builder.SetInsertPoint(getLastInstruction(VL));
+    Value *InVec = vectorizeTree_rec(INVL);
+    CastInst *CI = cast<CastInst>(VL0);
+    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::FCmp:
+  case Instruction::ICmp: {
+    // Check that all of the compares have the same predicate.
+    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+      CmpInst *Cmp = cast<CmpInst>(VL[i]);
+      if (Cmp->getPredicate() != P0)
+        return Gather(VL, VecTy);
+    }
+
+    ValueList LHSV, RHSV;
+    for (int i = 0, e = VL.size(); i < e; ++i) {
+      LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Builder.SetInsertPoint(getLastInstruction(VL));
+    Value *L = vectorizeTree_rec(LHSV);
+    Value *R = vectorizeTree_rec(RHSV);
+    Value *V;
+    if (Opcode == Instruction::FCmp)
+      V = Builder.CreateFCmp(P0, L, R);
+    else
+      V = Builder.CreateICmp(P0, L, R);
+
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Select: {
+    ValueList TrueVec, FalseVec, CondVec;
+    for (int i = 0, e = VL.size(); i < e; ++i) {
+      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
+      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
+    }
+
+    Builder.SetInsertPoint(getLastInstruction(VL));
+    Value *True = vectorizeTree_rec(TrueVec);
+    Value *False = vectorizeTree_rec(FalseVec);
+    Value *Cond = vectorizeTree_rec(CondVec);
+    Value *V = Builder.CreateSelect(Cond, True, False);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    ValueList LHSVL, RHSVL;
+    for (int i = 0, e = VL.size(); i < e; ++i) {
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+    }
+
+    Builder.SetInsertPoint(getLastInstruction(VL));
+    Value *LHS = vectorizeTree_rec(LHSVL);
+    Value *RHS = vectorizeTree_rec(RHSVL);
+
+    assert((LHS != RHS || VL0->getOperand(0) == VL0->getOperand(1)) &&
+           "Invalid order");
+
+    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+    VectorizedValues[VL0] = V;
+    return V;
+  }
+  case Instruction::Load: {
+    // Check if all of the loads are consecutive.
+    for (unsigned i = 1, e = VL.size(); i < e; ++i)
+      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
+        return Gather(VL, VecTy);
+
+    // Loads are inserted at the head of the tree because we don't want to
+    // sink them all the way down past store instructions.
+    Builder.SetInsertPoint(getLastInstruction(VL));
+    LoadInst *LI = cast<LoadInst>(VL0);
+    Value *VecPtr =
+        Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+    unsigned Alignment = LI->getAlignment();
+    LI = Builder.CreateLoad(VecPtr);
+    LI->setAlignment(Alignment);
+
+    VectorizedValues[VL0] = LI;
+    return LI;
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(VL0);
+    unsigned Alignment = SI->getAlignment();
+
+    ValueList ValueOp;
+    for (int i = 0, e = VL.size(); i < e; ++i)
+      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+
+    Value *VecValue = vectorizeTree_rec(ValueOp);
+
+    Builder.SetInsertPoint(getLastInstruction(VL));
+    Value *VecPtr =
+        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
+
+    for (int i = 0, e = VL.size(); i < e; ++i)
+      cast<Instruction>(VL[i])->eraseFromParent();
+    return 0;
+  }
+  default:
+    return Gather(VL, VecTy);
+  }
+}
+
+/// Vectorize the tree rooted at \p VL, then clear the cached analysis state.
+Value *FuncSLP::vectorizeTree(ArrayRef<Value *> VL) {
+  Builder.SetInsertPoint(getLastInstruction(VL));
+  Value *Root = vectorizeTree_rec(VL);
+  // Instructions were moved around, so the state collected for this tree and
+  // the cached block numbering are stale; drop them before the next analysis.
+  MustGather.clear();
+  VectorizedValues.clear();
+  MemBarrierIgnoreList.clear();
+  for (Function::iterator BB = F->begin(), End = F->end(); BB != End; ++BB)
+    BlocksNumbers[BB].forget();
+  return Root;
+}
+
+/// Vectorize the tree rooted at \p Operands and rewire the scalar users.
+/// \returns the vector produced by vectorizeTree.
+Value *FuncSLP::vectorizeArith(ArrayRef<Value *> Operands) {
+  Value *Vec = vectorizeTree(Operands);
+  // Each scalar root may still have users. Extract its lane from the new
+  // vector and redirect every use of the scalar to the extracted value.
+  for (unsigned Lane = 0, E = Operands.size(); Lane != E; ++Lane) {
+    Value *Ext = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
+    Operands[Lane]->replaceAllUsesWith(Ext);
+  }
+  return Vec;
+}
+
+/// Move gather (insertelement) instructions recorded in GatherSeq out of
+/// their loop and into the loop preheader when neither the vector operand
+/// nor the inserted element is an instruction inside that loop. Entries that
+/// are not in a loop, or whose loop has no preheader, are left where they are.
+void FuncSLP::hoistGatherSequence() {
+  for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
+                                          e = GatherSeq.end();
+       it != e; ++it) {
+    InsertElementInst *Insert = dyn_cast_or_null<InsertElementInst>(*it);
+
+    // Skip null entries and entries that are not InsertElement instructions;
+    // there is nothing to hoist for them.
+    if (!Insert)
+      continue;
+
+    // GatherSeq holds instructions from the whole function, so an entry that
+    // is not inside a loop must not stop us from looking at later entries.
+    Loop *L = LI->getLoopFor(Insert->getParent());
+    if (!L)
+      continue;
+
+    // Likewise, only skip this entry if its loop has no preheader.
+    BasicBlock *PreHeader = L->getLoopPreheader();
+    if (!PreHeader)
+      continue;
+
+    // If the vector or the element that we insert into it are
+    // instructions that are defined inside the loop then we can't
+    // hoist this instruction.
+    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+    if (CurrVec && L->contains(CurrVec))
+      continue;
+    if (NewElem && L->contains(NewElem))
+      continue;
+
+    // We can hoist this instruction: move it right before the terminator of
+    // the pre-header.
+    Insert->moveBefore(PreHeader->getTerminator());
+  }
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
-  typedef MapVector<Value *, BoUpSLP::StoreList> StoreListMap;
+  typedef SmallVector<StoreInst *, 8> StoreList;
+  typedef MapVector<Value *, StoreList> StoreListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
@@ -80,34 +1213,26 @@ struct SLPVectorizer : public FunctionPa
 
     DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
 
+    // Use the bottom up slp vectorizer to construct chains that start with
+    // the store instructions.
+    FuncSLP R(&F, SE, DL, TTI, AA, LI);
+
     for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
       BasicBlock *BB = it;
-      bool BBChanged = false;
-
-      // Use the bollom up slp vectorizer to construct chains that start with
-      // he store instructions.
-      BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
 
       // Vectorize trees that end at reductions.
-      BBChanged |= vectorizeChainsInBlock(BB, R);
+      Changed |= vectorizeChainsInBlock(BB, R);
 
       // Vectorize trees that end at stores.
       if (unsigned count = collectStores(BB, R)) {
         (void)count;
         DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
-        BBChanged |= vectorizeStoreChains(R);
+        Changed |= vectorizeStoreChains(R);
       }
-
-      // Try to hoist some of the scalarization code to the preheader.
-      if (BBChanged) {
-        hoistGatherSequence(LI, BB, R);
-        Changed |= vectorizeUsingGatherHints(R.getGatherSeqInstructions());
-      }
-
-      Changed |= BBChanged;
     }
 
     if (Changed) {
+      R.hoistGatherSequence();
       DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
       DEBUG(verifyFunction(F));
     }
@@ -128,42 +1253,31 @@ private:
   /// object. We sort the stores to their base objects to reduce the cost of the
   /// quadratic search on the stores. TODO: We can further reduce this cost
   /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+  unsigned collectStores(BasicBlock *BB, FuncSLP &R);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
-  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
+  bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
 
   /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
   /// then we calculate the cost of extracting the scalars from the vector.
   /// \returns true if a value was vectorized.
-  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);
+  bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool NeedExtracts);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
-  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
+  bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
 
   /// \brief Vectorize the stores that were collected in StoreRefs.
-  bool vectorizeStoreChains(BoUpSLP &R);
-
-  /// \brief Try to hoist gather sequences outside of the loop in cases where
-  /// all of the sources are loop invariant.
-  void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
-
-  /// \brief Try to vectorize additional sequences in different basic blocks
-  /// based on values that we gathered in previous blocks. The list \p Gathers
-  /// holds the gather InsertElement instructions that were generated during
-  /// vectorization.
-  /// \returns True if some code was vectorized.
-  bool vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers);
+  bool vectorizeStoreChains(FuncSLP &R);
 
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
-  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+  bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R);
 
 private:
   StoreListMap StoreRefs;
 };
 
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
   unsigned count = 0;
   StoreRefs.clear();
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -188,14 +1302,14 @@ unsigned SLPVectorizer::collectStores(Ba
   return count;
 }
 
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) {
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
   return tryToVectorizeList(VL, R, true);
 }
 
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
                                        bool NeedExtracts) {
   if (VL.size() < 2)
     return false;
@@ -219,7 +1333,10 @@ bool SLPVectorizer::tryToVectorizeList(A
   }
 
   int Cost = R.getTreeCost(VL);
-  int ExtrCost = NeedExtracts ? R.getScalarizationCost(VL) : 0;
+  if (Cost == FuncSLP::MAX_COST)
+    return false;
+
+  int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0;
   DEBUG(dbgs() << "SLP: Cost of pair:" << Cost
                << " Cost of extract:" << ExtrCost << ".\n");
   if ((Cost + ExtrCost) >= -SLPCostThreshold)
@@ -229,10 +1346,10 @@ bool SLPVectorizer::tryToVectorizeList(A
   return true;
 }
 
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
   if (!V)
     return false;
-  
+
   // Try to vectorize V.
   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
     return true;
@@ -269,7 +1386,7 @@ bool SLPVectorizer::tryToVectorize(Binar
   return 0;
 }
 
-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
   bool Changed = false;
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     if (isa<DbgInfoIntrinsic>(it))
@@ -292,7 +1409,7 @@ bool SLPVectorizer::vectorizeChainsInBlo
       Value *Inst = BI->getOperand(0);
       if (Inst == P)
         Inst = BI->getOperand(1);
-      
+
       Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
       continue;
     }
@@ -337,7 +1454,7 @@ bool SLPVectorizer::vectorizeChainsInBlo
   return Changed;
 }
 
-bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
+bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
   for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -353,92 +1470,6 @@ bool SLPVectorizer::vectorizeStoreChains
   return Changed;
 }
 
-bool SLPVectorizer::vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers) {
-  SmallVector<Value *, 4> Seq;
-  bool Changed = false;
-  for (int i = 0, e = Gathers.size(); i < e; ++i) {
-    InsertElementInst *IEI = dyn_cast_or_null<InsertElementInst>(Gathers[i]);
-
-    if (IEI) {
-      if (Instruction *I = dyn_cast<Instruction>(IEI->getOperand(1)))
-        Seq.push_back(I);
-    } else {
-
-      if (!Seq.size())
-        continue;
-
-      Instruction *I = cast<Instruction>(Seq[0]);
-      BasicBlock *BB = I->getParent();
-
-      DEBUG(dbgs() << "SLP: Inspecting a gather list of size " << Seq.size()
-                   << " in " << BB->getName() << ".\n");
-
-      // Check if the gathered values have multiple uses. If they only have one
-      // user then we know that the insert/extract pair will go away.
-      bool HasMultipleUsers = false;
-      for (int i = 0; e = Seq.size(), i < e; ++i) {
-        if (!Seq[i]->hasOneUse()) {
-          HasMultipleUsers = true;
-          break;
-        }
-      }
-
-      BoUpSLP BO(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
-
-      if (tryToVectorizeList(Seq, BO, HasMultipleUsers)) {
-        DEBUG(dbgs() << "SLP: Vectorized a gather list of len " << Seq.size()
-                     << " in " << BB->getName() << ".\n");
-        Changed = true;
-      }
-
-      Seq.clear();
-    }
-  }
-
-  return Changed;
-}
-
-void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
-                                        BoUpSLP &R) {
-  // Check if this block is inside a loop.
-  Loop *L = LI->getLoopFor(BB);
-  if (!L)
-    return;
-
-  // Check if it has a preheader.
-  BasicBlock *PreHeader = L->getLoopPreheader();
-  if (!PreHeader)
-    return;
-
-  // Mark the insertion point for the block.
-  Instruction *Location = PreHeader->getTerminator();
-
-  BoUpSLP::InstrList &Gathers = R.getGatherSeqInstructions();
-  for (BoUpSLP::InstrList::iterator it = Gathers.begin(), e = Gathers.end();
-       it != e; ++it) {
-    InsertElementInst *Insert = dyn_cast_or_null<InsertElementInst>(*it);
-
-    // The InsertElement sequence can be simplified into a constant.
-    // Also Ignore NULL pointers because they are only here to separate
-    // sequences.
-    if (!Insert)
-      continue;
-
-    // If the vector or the element that we insert into it are
-    // instructions that are defined in this basic block then we can't
-    // hoist this instruction.
-    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
-    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
-    if (CurrVec && L->contains(CurrVec))
-      continue;
-    if (NewElem && L->contains(NewElem))
-      continue;
-
-    // We can hoist this instruction. Move it to the pre-header.
-    Insert->moveBefore(Location);
-  }
-}
-
 } // end anonymous namespace
 
 char SLPVectorizer::ID = 0;

Removed: llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp?rev=184646&view=auto
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VecUtils.cpp (removed)
@@ -1,1031 +0,0 @@
-//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "SLP"
-
-#include "VecUtils.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <map>
-
-using namespace llvm;
-
-static const unsigned MinVecRegSize = 128;
-
-static const unsigned RecursionMaxDepth = 6;
-
-namespace llvm {
-
-BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
-                 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp)
-    : Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) {
-  numberInstructions();
-}
-
-void BoUpSLP::numberInstructions() {
-  int Loc = 0;
-  InstrIdx.clear();
-  InstrVec.clear();
-  // Number the instructions in the block.
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    InstrIdx[it] = Loc++;
-    InstrVec.push_back(it);
-    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
-  }
-}
-
-Value *BoUpSLP::getPointerOperand(Value *I) {
-  if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return LI->getPointerOperand();
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return SI->getPointerOperand();
-  return 0;
-}
-
-unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
-  if (LoadInst *L = dyn_cast<LoadInst>(I))
-    return L->getPointerAddressSpace();
-  if (StoreInst *S = dyn_cast<StoreInst>(I))
-    return S->getPointerAddressSpace();
-  return -1;
-}
-
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
-  Value *PtrA = getPointerOperand(A);
-  Value *PtrB = getPointerOperand(B);
-  unsigned ASA = getAddressSpaceOperand(A);
-  unsigned ASB = getAddressSpaceOperand(B);
-
-  // Check that the address spaces match and that the pointers are valid.
-  if (!PtrA || !PtrB || (ASA != ASB))
-    return false;
-
-  // Check that A and B are of the same type.
-  if (PtrA->getType() != PtrB->getType())
-    return false;
-
-  // Calculate the distance.
-  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
-  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
-  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
-  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
-
-  // Non constant distance.
-  if (!ConstOffSCEV)
-    return false;
-
-  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
-  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
-  // The Instructions are connsecutive if the size of the first load/store is
-  // the same as the offset.
-  int64_t Sz = DL->getTypeStoreSize(Ty);
-  return ((-Offset) == Sz);
-}
-
-bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
-  unsigned ChainLen = Chain.size();
-  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
-               << "\n");
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
-  unsigned VF = MinVecRegSize / Sz;
-
-  if (!isPowerOf2_32(Sz) || VF < 2)
-    return false;
-
-  bool Changed = false;
-  // Look for profitable vectorizable trees at all offsets, starting at zero.
-  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
-    if (i + VF > e)
-      break;
-    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
-                 << "\n");
-    ArrayRef<Value *> Operands = Chain.slice(i, VF);
-
-    int Cost = getTreeCost(Operands);
-    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
-    if (Cost < CostThreshold) {
-      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
-      Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands, VF)));
-      vectorizeTree(Operands, VF);
-      i += VF - 1;
-      Changed = true;
-    }
-  }
-
-  if (Changed || ChainLen > VF)
-    return Changed;
-
-  // Handle short chains. This helps us catch types such as <3 x float> that
-  // are smaller than vector size.
-  int Cost = getTreeCost(Chain);
-  if (Cost < CostThreshold) {
-    DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
-                 << " for size = " << ChainLen << "\n");
-    Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Chain, ChainLen)));
-    vectorizeTree(Chain, ChainLen);
-    return true;
-  }
-
-  return false;
-}
-
-bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
-  SetVector<Value *> Heads, Tails;
-  SmallDenseMap<Value *, Value *> ConsecutiveChain;
-
-  // We may run into multiple chains that merge into a single chain. We mark the
-  // stores that we vectorized so that we don't visit the same store twice.
-  ValueSet VectorizedStores;
-  bool Changed = false;
-
-  // Do a quadratic search on all of the given stores and find
-  // all of the pairs of loads that follow each other.
-  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
-    for (unsigned j = 0; j < e; ++j) {
-      if (i == j)
-        continue;
-      
-      if (isConsecutiveAccess(Stores[i], Stores[j])) {
-        Tails.insert(Stores[j]);
-        Heads.insert(Stores[i]);
-        ConsecutiveChain[Stores[i]] = Stores[j];
-      }
-    }
-
-  // For stores that start but don't end a link in the chain:
-  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
-       it != e; ++it) {
-    if (Tails.count(*it))
-      continue;
-
-    // We found a store instr that starts a chain. Now follow the chain and try
-    // to vectorize it.
-    ValueList Operands;
-    Value *I = *it;
-    // Collect the chain into a list.
-    while (Tails.count(I) || Heads.count(I)) {
-      if (VectorizedStores.count(I))
-        break;
-      Operands.push_back(I);
-      // Move to the next value in the chain.
-      I = ConsecutiveChain[I];
-    }
-
-    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
-
-    // Mark the vectorized stores so that we don't vectorize them again.
-    if (Vectorized)
-      VectorizedStores.insert(Operands.begin(), Operands.end());
-    Changed |= Vectorized;
-  }
-
-  return Changed;
-}
-
-int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
-  // Find the type of the operands in VL.
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-  // Find the cost of inserting/extracting values from the vector.
-  return getScalarizationCost(VecTy);
-}
-
-int BoUpSLP::getScalarizationCost(Type *Ty) {
-  int Cost = 0;
-  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
-    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-  return Cost;
-}
-
-AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
-  if (StoreInst *SI = dyn_cast<StoreInst>(I))
-    return AA->getLocation(SI);
-  if (LoadInst *LI = dyn_cast<LoadInst>(I))
-    return AA->getLocation(LI);
-  return AliasAnalysis::Location();
-}
-
-Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
-  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
-  BasicBlock::iterator I = Src, E = Dst;
-  /// Scan all of the instruction from SRC to DST and check if
-  /// the source may alias.
-  for (++I; I != E; ++I) {
-    // Ignore store instructions that are marked as 'ignore'.
-    if (MemBarrierIgnoreList.count(I))
-      continue;
-    if (Src->mayWriteToMemory()) /* Write */ {
-      if (!I->mayReadOrWriteMemory())
-        continue;
-    } else /* Read */ {
-      if (!I->mayWriteToMemory())
-        continue;
-    }
-    AliasAnalysis::Location A = getLocation(&*I);
-    AliasAnalysis::Location B = getLocation(Src);
-
-    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
-      return I;
-  }
-  return 0;
-}
-
-Value *BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
-  int LastIdx = getLastIndex(Operands, Operands.size());
-  Instruction *Loc = getInsertionPoint(LastIdx);
-  Builder.SetInsertPoint(Loc);
-
-  assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx &&
-         "Vectorizing with in-tree users");
-
-  Value *Vec = vectorizeTree(Operands, Operands.size());
-  // After vectorizing the operands we need to generate extractelement
-  // instructions and replace all of the uses of the scalar values with
-  // the values that we extracted from the vectorized tree.
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
-    Operands[i]->replaceAllUsesWith(S);
-  }
-
-  return Vec;
-}
-
-int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
-  // Get rid of the list of stores that were removed, and from the
-  // lists of instructions with multiple users.
-  MemBarrierIgnoreList.clear();
-  LaneMap.clear();
-  MultiUserVals.clear();
-  MustScalarize.clear();
-  MustExtract.clear();
-
-  // Find the location of the last root.
-  int LastRootIndex = getLastIndex(VL, VL.size());
-  int FirstUserIndex = getFirstUserIndex(VL, VL.size());
-
-  // Don't vectorize if there are users of the tree roots inside the tree
-  // itself.
-  if (LastRootIndex > FirstUserIndex)
-    return max_cost;
-
-  // Scan the tree and find which value is used by which lane, and which values
-  // must be scalarized.
-  getTreeUses_rec(VL, 0);
-
-  // Check that instructions with multiple users can be vectorized. Mark unsafe
-  // instructions.
-  for (SetVector<Value *>::iterator it = MultiUserVals.begin(),
-                                    e = MultiUserVals.end();
-       it != e; ++it) {
-    // Check that all of the users of this instr are within the tree
-    // and that they are all from the same lane.
-    int Lane = -1;
-    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
-         I != E; ++I) {
-      if (LaneMap.find(*I) == LaneMap.end()) {
-        DEBUG(dbgs() << "SLP: Instr " << **it << " has multiple users.\n");
-
-        // We don't have an ordering problem if the user is not in this basic
-        // block.
-        Instruction *Inst = cast<Instruction>(*I);
-        if (Inst->getParent() != BB) {
-          MustExtract.insert(*it);
-          continue;
-        }
-
-        // We don't have an ordering problem if the user is after the last root.
-        int Idx = InstrIdx[Inst];
-        if (Idx < LastRootIndex) {
-          MustScalarize.insert(*it);
-          DEBUG(dbgs() << "SLP: Adding to MustScalarize "
-                          "because of an unsafe out of tree usage.\n");
-          break;
-        }
-
-        DEBUG(dbgs() << "SLP: Adding to MustExtract "
-                        "because of a safe out of tree usage.\n");
-        MustExtract.insert(*it);
-        continue;
-      }
-      if (Lane == -1)
-        Lane = LaneMap[*I];
-      if (Lane != LaneMap[*I]) {
-        MustScalarize.insert(*it);
-        DEBUG(dbgs() << "SLP: Adding " << **it
-                     << " to MustScalarize because multiple lane use it: "
-                     << Lane << " and " << LaneMap[*I] << ".\n");
-        break;
-      }
-    }
-  }
-
-  // Now calculate the cost of vectorizing the tree.
-  return getTreeCost_rec(VL, 0);
-}
-
-static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
-                            VectorType *VecTy) {
-  // Check if all of the extracts come from the same vector and from the
-  // correct offset.
-  Value *VL0 = VL[0];
-  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
-  Value *Vec = E0->getOperand(0);
-
-  // We have to extract from the same vector type.
-  if (Vec->getType() != VecTy)
-    return false;
-
-  // Check that all of the indices extract from the correct offset.
-  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
-  if (!CI || CI->getZExtValue())
-    return false;
-
-  for (unsigned i = 1, e = VF; i < e; ++i) {
-    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
-    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
-    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
-      return false;
-  }
-
-  return true;
-}
-
-void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  if (Depth == RecursionMaxDepth)
-    return;
-
-  // Don't handle vectors.
-  if (VL[0]->getType()->isVectorTy())
-    return;
-  
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    if (SI->getValueOperand()->getType()->isVectorTy())
-      return;
-
-  // Check if all of the operands are constants.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If one of the instructions is out of this BB, we need to scalarize all.
-    if (I && I->getParent() != BB)
-      return;
-  }
-
-  // If all of the operands are identical or constant we have a simple solution.
-  if (AllConst || AllSameScalar)
-    return;
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0)
-    return;
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode())
-      return;
-  }
-
-  for (int i = 0, e = VL.size(); i < e; ++i) {
-    // Check that the instruction is only used within
-    // one lane.
-    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i)
-      return;
-    // Make this instruction as 'seen' and remember the lane.
-    LaneMap[VL[i]] = i;
-  }
-
-  // Mark instructions with multiple users.
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // Remember to check if all of the users of this instr are vectorized
-    // within our tree. At depth zero we have no local users, only external
-    // users that we don't care about.
-    if (Depth && I && I->getNumUses() > 1) {
-      DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
-                      "because it has multiple users:" << *I << " \n");
-      MultiUserVals.insert(I);
-    }
-  }
-
-  switch (Opcode) {
-  case Instruction::ExtractElement: {
-    VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
-    // No need to follow ExtractElements that are going to be optimized away.
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return;
-    // Fall through.
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast:
-  case Instruction::Select:
-  case Instruction::ICmp:
-  case Instruction::FCmp:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      getTreeUses_rec(Operands, Depth + 1);
-    }
-    return;
-  }
-  case Instruction::Store: {
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j)
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-    getTreeUses_rec(Operands, Depth + 1);
-    return;
-  }
-  default:
-    return;
-  }
-}
-
-int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  Type *ScalarTy = VL[0]->getType();
-
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-
-  /// Don't mess with vectors.
-  if (ScalarTy->isVectorTy())
-    return max_cost;
-  
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-
-  if (Depth == RecursionMaxDepth)
-    return getScalarizationCost(VecTy);
-
-  // Check if all of the operands are constants.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  bool MustScalarizeFlag = false;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // Must have a single use.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    MustScalarizeFlag |= MustScalarize.count(VL[i]);
-    // This instruction is outside the basic block.
-    if (I && I->getParent() != BB)
-      return getScalarizationCost(VecTy);
-  }
-
-  // Is this a simple vector constant.
-  if (AllConst)
-    return 0;
-
-  // If all of the operands are identical we can broadcast them.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (AllSameScalar) {
-    // If we are in a loop, and this is not an instruction (e.g. constant or
-    // argument) or the instruction is defined outside the loop then assume
-    // that the cost is zero.
-    if (L && (!VL0 || !L->contains(VL0)))
-      return 0;
-
-    // We need to broadcast the scalar.
-    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
-  }
-
-  // If this is not a constant, or a scalar from outside the loop then we
-  // need to scalarize it.
-  if (MustScalarizeFlag)
-    return getScalarizationCost(VecTy);
-
-  if (!VL0)
-    return getScalarizationCost(VecTy);
-  assert(VL0->getParent() == BB && "Wrong BB");
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode())
-      return getScalarizationCost(VecTy);
-  }
-
-  // Check if it is safe to sink the loads or the stores.
-  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
-    int MaxIdx = getLastIndex(VL, VL.size());
-    Instruction *Last = InstrVec[MaxIdx];
-
-    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-      if (VL[i] == Last)
-        continue;
-      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
-      if (Barrier) {
-        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
-                     << "\n because of " << *Barrier << "\n");
-        return max_cost;
-      }
-    }
-  }
-
-  // Calculate the extract cost.
-  unsigned ExternalUserExtractCost = 0;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i)
-    if (MustExtract.count(VL[i]))
-      ExternalUserExtractCost +=
-          TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-
-  switch (Opcode) {
-  case Instruction::ExtractElement: {
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return 0;
-    return getScalarizationCost(VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    int Cost = ExternalUserExtractCost;
-    ValueList Operands;
-    Type *SrcTy = VL0->getOperand(0)->getType();
-    // Prepare the operand vector.
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      // Check that the casted type is the same for all users.
-      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
-        return getScalarizationCost(VecTy);
-    }
-
-    Cost += getTreeCost_rec(Operands, Depth + 1);
-    if (Cost >= max_cost)
-      return max_cost;
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp: {
-    // Check that all of the compares have the same predicate.
-    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
-    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
-      CmpInst *Cmp = cast<CmpInst>(VL[i]);
-      if (Cmp->getPredicate() != P0)
-        return getScalarizationCost(VecTy);
-    }
-    // Fall through.
-  }
-  case Instruction::Select:
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    int Cost = ExternalUserExtractCost;
-    // Calculate the cost of all of the operands.
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      Cost += getTreeCost_rec(Operands, Depth + 1);
-      if (Cost >= max_cost)
-        return max_cost;
-    }
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = 0;
-    int VecCost = 0;
-    if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
-        Opcode == Instruction::Select) {
-      VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
-      ScalarCost =
-          VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-      VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
-    } else {
-      ScalarCost = VecTy->getNumElements() *
-                   TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-      VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
-    }
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Load: {
-    // If we are scalarize the loads, add the cost of forming the vector.
-    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i], VL[i + 1]))
-        return getScalarizationCost(VecTy);
-
-    // Cost of wide load - cost of scalar loads.
-    int ScalarLdCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    int ScalarStCost = VecTy->getNumElements() *
-                       TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int StoreCost = VecStCost - ScalarStCost;
-
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      MemBarrierIgnoreList.insert(VL[j]);
-    }
-
-    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
-    return TotalCost + ExternalUserExtractCost;
-  }
-  default:
-    // Unable to vectorize unknown instructions.
-    return getScalarizationCost(VecTy);
-  }
-}
-
-int BoUpSLP::getLastIndex(ArrayRef<Value *> VL, unsigned VF) {
-  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
-  for (unsigned i = 0; i < VF; ++i)
-    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-  return MaxIdx;
-}
-
-int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF) {
-  // Find the first user of the values.
-  int FirstUser = InstrVec.size();
-  for (unsigned i = 0; i < VF; ++i) {
-    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
-         U != UE; ++U) {
-      Instruction *Instr = dyn_cast<Instruction>(*U);
-      if (!Instr || Instr->getParent() != BB)
-        continue;
-
-      FirstUser = std::min(FirstUser, InstrIdx[Instr]);
-    }
-  }
-  return FirstUser;
-}
-
-int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) {
-  assert(I->getParent() == BB && "Invalid parent for instruction I");
-  assert(J->getParent() == BB && "Invalid parent for instruction J");
-  return std::max(InstrIdx[I], InstrIdx[J]);
-}
-
-Instruction *BoUpSLP::getInsertionPoint(unsigned Index) {
-  return InstrVec[Index + 1];
-}
-
-Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
-  Value *Vec = UndefValue::get(Ty);
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
-    // Generate the 'InsertElement' instruction.
-    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
-    // Remember that this instruction is used as part of a 'gather' sequence.
-    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
-    // if the users are outside of the basic block.
-    if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(Vec))
-      GatherInstructions.push_back(IEI);
-  }
-
-  // Mark the end of the gather sequence.
-  GatherInstructions.push_back(0);
-
-  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
-    VectorizedValues[VL[i]] = Vec;
-
-  return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
-  Value *V = vectorizeTree_rec(VL, VF);
-
-  int LastInstrIdx = getLastIndex(VL, VL.size());
-  for (SetVector<Value *>::iterator it = MustExtract.begin(),
-                                    e = MustExtract.end();
-       it != e; ++it) {
-    Instruction *I = cast<Instruction>(*it);
-
-    // This is a scalarized value, so we can use the original value.
-    // No need to extract from the vector.
-    if (!LaneMap.count(I))
-      continue;
-
-    Value *Vec = VectorizedValues[I];
-    // We decided not to vectorize I because one of its users was not
-    // vectorizerd. This is okay.
-    if (!Vec)
-      continue;
-
-    Value *Idx = Builder.getInt32(LaneMap[I]);
-    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
-    bool Replaced = false;
-    for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE;
-         ++U) {
-      Instruction *UI = cast<Instruction>(*U);
-      if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
-        UI->replaceUsesOfWith(I, Extract);
-      Replaced = true;
-    }
-    assert(Replaced && "Must replace at least one outside user");
-    (void)Replaced;
-  }
-
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  numberInstructions();
-  MustScalarize.clear();
-  MustExtract.clear();
-  VectorizedValues.clear();
-  return V;
-}
-
-Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VF);
-
-  // Check if all of the operands are constants or identical.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // The instruction must be in the same BB, and it must be vectorizable.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
-      return Scalarize(VL, VecTy);
-  }
-
-  // Check that this is a simple vector constant.
-  if (AllConst || AllSameScalar)
-    return Scalarize(VL, VecTy);
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0)
-    return Scalarize(VL, VecTy);
-
-  if (VectorizedValues.count(VL0)) {
-    Value *Vec = VectorizedValues[VL0];
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = Vec;
-    return Vec;
-  }
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode())
-      return Scalarize(VL, VecTy);
-  }
-
-  switch (Opcode) {
-  case Instruction::ExtractElement: {
-    if (CanReuseExtract(VL, VL.size(), VecTy))
-      return VL0->getOperand(0);
-    return Scalarize(VL, VecTy);
-  }
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    ValueList INVL;
-    for (int i = 0; i < VF; ++i)
-      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-    Value *InVec = vectorizeTree_rec(INVL, VF);
-    CastInst *CI = dyn_cast<CastInst>(VL0);
-    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::FCmp:
-  case Instruction::ICmp: {
-    // Check that all of the compares have the same predicate.
-    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
-    for (unsigned i = 1, e = VF; i < e; ++i) {
-      CmpInst *Cmp = cast<CmpInst>(VL[i]);
-      if (Cmp->getPredicate() != P0)
-        return Scalarize(VL, VecTy);
-    }
-
-    ValueList LHSV, RHSV;
-    for (int i = 0; i < VF; ++i) {
-      LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
-    }
-
-    Value *L = vectorizeTree_rec(LHSV, VF);
-    Value *R = vectorizeTree_rec(RHSV, VF);
-    Value *V;
-    if (VL0->getOpcode() == Instruction::FCmp)
-      V = Builder.CreateFCmp(P0, L, R);
-    else
-      V = Builder.CreateICmp(P0, L, R);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::Select: {
-    ValueList TrueVec, FalseVec, CondVec;
-    for (int i = 0; i < VF; ++i) {
-      CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
-      FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
-    }
-
-    Value *True = vectorizeTree_rec(TrueVec, VF);
-    Value *False = vectorizeTree_rec(FalseVec, VF);
-    Value *Cond = vectorizeTree_rec(CondVec, VF);
-    Value *V = Builder.CreateSelect(Cond, True, False);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0; i < VF; ++i) {
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
-    }
-
-    Value *LHS = vectorizeTree_rec(LHSVL, VF);
-    Value *RHS = vectorizeTree_rec(RHSVL, VF);
-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = V;
-
-    return V;
-  }
-  case Instruction::Load: {
-    LoadInst *LI = cast<LoadInst>(VL0);
-    unsigned Alignment = LI->getAlignment();
-
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VF; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i - 1], VL[i]))
-        return Scalarize(VL, VecTy);
-
-    // Loads are inserted at the head of the tree because we don't want to sink
-    // them all the way down past store instructions.
-    Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size()));
-    IRBuilder<> LoadBuilder(Loc);
-    Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(),
-                                              VecTy->getPointerTo());
-    LI = LoadBuilder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-
-    for (int i = 0; i < VF; ++i)
-      VectorizedValues[VL[i]] = LI;
-
-    return LI;
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
-
-    ValueList ValueOp;
-    for (int i = 0; i < VF; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
-
-    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
-    Value *VecPtr =
-        Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-
-    for (int i = 0; i < VF; ++i)
-      cast<Instruction>(VL[i])->eraseFromParent();
-    return 0;
-  }
-  default:
-    return Scalarize(VL, VecTy);
-  }
-}
-
-} // end of namespace

Removed: llvm/trunk/lib/Transforms/Vectorize/VecUtils.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/VecUtils.h?rev=184646&view=auto
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VecUtils.h (original)
+++ llvm/trunk/lib/Transforms/Vectorize/VecUtils.h (removed)
@@ -1,194 +0,0 @@
-//===- VecUtils.h - Vectorization Utilities -------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of classes and functions manipulate vectors and chains of
-// vectors.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/IRBuilder.h"
-#include <vector>
-
-namespace llvm {
-
-class BasicBlock;
-class Instruction;
-class Type;
-class VectorType;
-class StoreInst;
-class Value;
-class ScalarEvolution;
-class DataLayout;
-class TargetTransformInfo;
-class AliasAnalysis;
-class Loop;
-
-/// Bottom Up SLP vectorization utility class.
-struct BoUpSLP {
-  typedef SmallVector<Value *, 8> ValueList;
-  typedef SmallVector<Instruction *, 16> InstrList;
-  typedef SmallPtrSet<Value *, 16> ValueSet;
-  typedef SmallVector<StoreInst *, 8> StoreList;
-  static const int max_cost = 1 << 20;
-
-  // \brief C'tor.
-  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
-          TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
-
-  /// \brief Take the pointer operand from the Load/Store instruction.
-  /// \returns NULL if this is not a valid Load/Store instruction.
-  static Value *getPointerOperand(Value *I);
-
-  /// \brief Take the address space operand from the Load/Store instruction.
-  /// \returns -1 if this is not a valid Load/Store instruction.
-  static unsigned getAddressSpaceOperand(Value *I);
-
-  /// \returns true if the memory operations A and B are consecutive.
-  bool isConsecutiveAccess(Value *A, Value *B);
-
-  /// \brief Vectorize the tree that starts with the elements in \p VL.
-  /// \returns the vectorized value.
-  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
-
-  /// \returns the vectorization cost of the subtree that starts at \p VL.
-  /// A negative number means that this is profitable.
-  int getTreeCost(ArrayRef<Value *> VL);
-
-  /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
-  int getScalarizationCost(ArrayRef<Value *> VL);
-
-  /// \brief Attempts to order and vectorize a sequence of stores. This
-  /// function does a quadratic scan of the given stores.
-  /// \returns true if the basic block was modified.
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
-
-  /// \brief Vectorize a group of scalars into a vector tree.
-  /// \returns the vectorized value.
-  Value *vectorizeArith(ArrayRef<Value *> Operands);
-
-  /// \returns the list of new instructions that were added in order to collect
-  /// scalars into vectors. This list can be used to further optimize the gather
-  /// sequences.
-  InstrList &getGatherSeqInstructions() { return GatherInstructions; }
-
-private:
-  /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This recursive method looks for vectorization hazards such as
-  /// values that are used by multiple users and checks that values are used
-  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
-
-  /// \brief Number all of the instructions in the block.
-  void numberInstructions();
-
-  ///  \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
-
-  /// \returns the scalarization cost for this type. Scalarization in this
-  /// context means the creation of vectors from a group of scalars.
-  int getScalarizationCost(Type *Ty);
-
-  /// \returns the AA location that is being access by the instruction.
-  AliasAnalysis::Location getLocation(Instruction *I);
-
-  /// \brief Checks if it is possible to sink an instruction from
-  /// \p Src to \p Dst.
-  /// \returns the pointer to the barrier instruction if we can't sink.
-  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
-
-  /// \returns the index of the last instrucion in the BB from \p VL.
-  /// Only consider the first \p VF elements.
-  int getLastIndex(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns the index of the first User of \p VL.
-  /// Only consider the first \p VF elements.
-  int getFirstUserIndex(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns the instruction \p I or \p J that appears last in the BB .
-  int getLastIndex(Instruction *I, Instruction *J);
-
-  /// \returns the insertion point for \p Index.
-  Instruction *getInsertionPoint(unsigned Index);
-
-  /// \returns a vector from a collection of scalars in \p VL.
-  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
-
-private:
-  /// Maps instructions to numbers and back.
-  SmallDenseMap<Value *, int> InstrIdx;
-  /// Maps integers to Instructions.
-  std::vector<Instruction *> InstrVec;
-
-  // -- containers that are used during getTreeCost -- //
-
-  /// Contains values that must be scalarized because they are used
-  /// by multiple lanes, or by users outside the tree.
-  /// NOTICE: The vectorization methods also use this set.
-  ValueSet MustScalarize;
-
-  /// Contains values that have users outside of the vectorized graph.
-  /// We need to generate extract instructions for these values.
-  /// NOTICE: The vectorization methods also use this set.
-  SetVector<Value *> MustExtract;
-
-  /// Contains a list of values that are used outside the current tree. This
-  /// set must be reset between runs.
-  SetVector<Value *> MultiUserVals;
-  /// Maps values in the tree to the vector lanes that uses them. This map must
-  /// be reset between runs of getCost.
-  std::map<Value *, int> LaneMap;
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This map must be reset between runs of getCost.
-  ValueSet MemBarrierIgnoreList;
-
-  // -- Containers that are used during vectorizeTree -- //
-
-  /// Maps between the first scalar to the vector. This map must be reset
-  /// between runs.
-  DenseMap<Value *, Value *> VectorizedValues;
-
-  // -- Containers that are used after vectorization by the caller -- //
-
-  /// A list of instructions that are used when gathering scalars into vectors.
-  /// In many cases these instructions can be hoisted outside of the BB.
-  /// Iterating over this list is faster than calling LICM.
-  /// Notice: We insert NULL ptrs to separate between the different gather
-  /// sequences.
-  InstrList GatherInstructions;
-
-  /// Instruction builder to construct the vectorized tree.
-  IRBuilder<> Builder;
-
-  // Analysis and block reference.
-  BasicBlock *BB;
-  ScalarEvolution *SE;
-  DataLayout *DL;
-  TargetTransformInfo *TTI;
-  AliasAnalysis *AA;
-  Loop *L;
-};
-
-} // end of namespace
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll?rev=184647&r1=184646&r2=184647&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/diamond.ll Sat Jun 22 16:34:10 2013
@@ -50,9 +50,9 @@ entry:
 ; }
 
 ; CHECK: @extr_user
+; CHECK: load i32*
 ; CHECK: store <4 x i32>
-; CHECK-NEXT: extractelement <4 x i32>
-; CHECK: ret
+; CHECK-NEXT: ret
 define i32 @extr_user(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4
@@ -79,9 +79,9 @@ entry:
 
 ; In this example we have an external user that is not the first element in the vector.
 ; CHECK: @extr_user1
+; CHECK: load i32*
 ; CHECK: store <4 x i32>
-; CHECK-NEXT: extractelement <4 x i32>
-; CHECK: ret
+; CHECK-NEXT: ret
 define i32 @extr_user1(i32* noalias nocapture %B, i32* noalias nocapture %A, i32 %n, i32 %m) {
 entry:
   %0 = load i32* %A, align 4

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll?rev=184647&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_block.ll Sat Jun 22 16:34:10 2013
@@ -0,0 +1,55 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+; int bar(double *A, int d) {
+;   double A0 = A[0];
+;   double A1 = A[1];
+;   float F0 = A0;
+;   float F1 = A1;
+;   if (d) foo(); <----- This splits the blocks
+;   F0+=4.0;
+;   F1+=5.0;
+;   A[8] = 9.0 + F0;
+;   A[9] = 5.0 + F1;
+; }
+
+
+;CHECK: @bar
+;CHECK: load <2 x double>
+;CHECK: fptrunc <2 x double>
+;CHECK: call i32
+;CHECK: fadd <2 x float>
+;CHECK: fpext <2 x float>
+;CHECK: store <2 x double>
+;CHECK: ret
+define i32 @bar(double* nocapture %A, i32 %d) {
+  %1 = load double* %A, align 8
+  %2 = getelementptr inbounds double* %A, i64 1
+  %3 = load double* %2, align 8
+  %4 = fptrunc double %1 to float
+  %5 = fptrunc double %3 to float
+  %6 = icmp eq i32 %d, 0
+  br i1 %6, label %9, label %7
+
+; <label>:7                                       ; preds = %0
+  %8 = tail call i32 (...)* @foo()
+  br label %9
+
+; <label>:9                                       ; preds = %0, %7
+  %10 = fadd float %4, 4.000000e+00
+  %11 = fadd float %5, 5.000000e+00
+  %12 = fpext float %10 to double
+  %13 = fadd double %12, 9.000000e+00
+  %14 = getelementptr inbounds double* %A, i64 8
+  store double %13, double* %14, align 8
+  %15 = fpext float %11 to double
+  %16 = fadd double %15, 5.000000e+00
+  %17 = getelementptr inbounds double* %A, i64 9
+  store double %16, double* %17, align 8
+  ret i32 undef
+}
+
+declare i32 @foo(...)
+

Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll?rev=184647&r1=184646&r2=184647&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll (original)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/multi_user.ll Sat Jun 22 16:34:10 2013
@@ -12,8 +12,8 @@ target triple = "x86_64-apple-macosx10.7
 ;}
 
 ;CHECK: @foo
-;CHECK: load <4 x i32>
 ;CHECK: insertelement <4 x i32>
+;CHECK: load <4 x i32>
 ;CHECK: add <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret





More information about the llvm-commits mailing list