[llvm-commits] [llvm] r149468 - in /llvm/trunk: docs/ include/llvm-c/ include/llvm-c/Transforms/ include/llvm/ include/llvm/Transforms/ include/llvm/Transforms/IPO/ lib/Transforms/ lib/Transforms/IPO/ lib/Transforms/Vectorize/ test/Transforms/BBVectorize/ tools/bugpoint/ tools/llvm-ld/ tools/lto/ tools/opt/

Nick Lewycky nicholas at mxc.ca
Tue Jan 31 20:59:29 PST 2012


Hal Finkel wrote:
> Author: hfinkel
> Date: Tue Jan 31 21:51:43 2012
> New Revision: 149468
>
> URL: http://llvm.org/viewvc/llvm-project?rev=149468&view=rev
> Log:
> Add a basic-block autovectorization pass.
>
> This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure.
> Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser).

Great stuff!

> Copied: llvm/trunk/include/llvm-c/Transforms/Vectorize.h (from r149457, llvm/trunk/include/llvm-c/Initialization.h)
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm-c/Transforms/Vectorize.h?p2=llvm/trunk/include/llvm-c/Transforms/Vectorize.h&p1=llvm/trunk/include/llvm-c/Initialization.h&r1=149457&r2=149468&rev=149468&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm-c/Initialization.h (original)
> +++ llvm/trunk/include/llvm-c/Transforms/Vectorize.h Tue Jan 31 21:51:43 2012
> @@ -1,4 +1,5 @@
> -/*===-- llvm-c/Initialization.h - Initialization C Interface ------*- C -*-===*\
> +/*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\

The mode marker here should be -*- C -*-, not -*- C++ -*-.

> Modified: llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp?rev=149468&r1=149467&r2=149468&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp (original)
> +++ llvm/trunk/lib/Transforms/IPO/PassManagerBuilder.cpp Tue Jan 31 21:51:43 2012
> @@ -21,14 +21,20 @@
>   #include "llvm/DefaultPasses.h"
>   #include "llvm/PassManager.h"
>   #include "llvm/Analysis/Passes.h"
> +#include "llvm/Analysis/Verifier.h"
> +#include "llvm/Support/CommandLine.h"
>   #include "llvm/Target/TargetLibraryInfo.h"
>   #include "llvm/Transforms/Scalar.h"
> +#include "llvm/Transforms/Vectorize.h"
>   #include "llvm/Transforms/IPO.h"
>   #include "llvm/ADT/SmallVector.h"
>   #include "llvm/Support/ManagedStatic.h"
>
>   using namespace llvm;
>
> +static cl::opt<bool>
> +RunVectorization("vectorize", cl::desc("Run vectorization passes"));
> +
>   PassManagerBuilder::PassManagerBuilder() {
>       OptLevel = 2;
>       SizeLevel = 0;
> @@ -37,6 +43,7 @@
>       DisableSimplifyLibCalls = false;
>       DisableUnitAtATime = false;
>       DisableUnrollLoops = false;
> +    Vectorize = RunVectorization;
>   }
>
>   PassManagerBuilder::~PassManagerBuilder() {
> @@ -172,6 +179,13 @@
>
>     addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
>
> +  if (Vectorize) {
> +    MPM.add(createBBVectorizePass());
> +    MPM.add(createInstructionCombiningPass());
> +    if (OptLevel>  1)
> +      MPM.add(createGVNPass());                 // Remove redundancies

Whooooaa... GVN is *really* expensive; I find it hard to believe that 
you want to run it twice even with vectorization on. Are you sure? What 
is this doing that instcombine isn't?

> Added: llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp?rev=149468&view=auto
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp (added)
> +++ llvm/trunk/lib/Transforms/Vectorize/BBVectorize.cpp Tue Jan 31 21:51:43 2012
> @@ -0,0 +1,1796 @@
> +//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +// This file implements a basic-block vectorization pass. The algorithm was
> +// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
> +// et al. It works by looking for chains of pairable operations and then
> +// pairing them.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#define BBV_NAME "bb-vectorize"

I think it's safe to constant propagate this away. :)

> +#define DEBUG_TYPE BBV_NAME
> +#include "llvm/Constants.h"
> +#include "llvm/DerivedTypes.h"
> +#include "llvm/Function.h"
> +#include "llvm/Instructions.h"
> +#include "llvm/IntrinsicInst.h"
> +#include "llvm/Intrinsics.h"
> +#include "llvm/LLVMContext.h"
> +#include "llvm/Pass.h"
> +#include "llvm/Type.h"
> +#include "llvm/ADT/DenseMap.h"
> +#include "llvm/ADT/DenseSet.h"
> +#include "llvm/ADT/SmallVector.h"
> +#include "llvm/ADT/Statistic.h"
> +#include "llvm/ADT/STLExtras.h"
> +#include "llvm/ADT/StringExtras.h"
> +#include "llvm/Analysis/AliasAnalysis.h"
> +#include "llvm/Analysis/AliasSetTracker.h"
> +#include "llvm/Analysis/ScalarEvolution.h"
> +#include "llvm/Analysis/ScalarEvolutionExpressions.h"
> +#include "llvm/Analysis/ValueTracking.h"
> +#include "llvm/Support/CommandLine.h"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/Support/ValueHandle.h"
> +#include "llvm/Target/TargetData.h"
> +#include "llvm/Transforms/Vectorize.h"
> +#include<algorithm>
> +#include<map>
> +using namespace llvm;
> +
> +static cl::opt<unsigned>
> +ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
> +  cl::desc("The required chain depth for vectorization"));
> +
> +static cl::opt<unsigned>
> +SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
> +  cl::desc("The maximum search distance for instruction pairs"));
> +
> +static cl::opt<bool>
> +SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
> +  cl::desc("Replicating one element to a pair breaks the chain"));
> +
> +static cl::opt<unsigned>
> +VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
> +  cl::desc("The size of the native vector registers"));
> +
> +static cl::opt<unsigned>
> +MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
> +  cl::desc("The maximum number of pairing iterations"));
> +
> +static cl::opt<unsigned>
> +MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
> +  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
> +                       " a full cycle check"));
> +
> +static cl::opt<bool>
> +NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
> +  cl::desc("Don't try to vectorize integer values"));
> +
> +static cl::opt<bool>
> +NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
> +  cl::desc("Don't try to vectorize floating-point values"));
> +
> +static cl::opt<bool>
> +NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
> +  cl::desc("Don't try to vectorize casting (conversion) operations"));
> +
> +static cl::opt<bool>
> +NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
> +  cl::desc("Don't try to vectorize floating-point math intrinsics"));
> +
> +static cl::opt<bool>
> +NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
> +  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
> +
> +static cl::opt<bool>
> +NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
> +  cl::desc("Don't try to vectorize loads and stores"));
> +
> +static cl::opt<bool>
> +AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
> +  cl::desc("Only generate aligned loads and stores"));
> +
> +static cl::opt<bool>
> +FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
> +  cl::desc("Use a fast instruction dependency analysis"));
> +
> +#ifndef NDEBUG
> +static cl::opt<bool>
> +DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
> +  cl::init(false), cl::Hidden,
> +  cl::desc("When debugging is enabled, output information on the"
> +           " instruction-examination process"));
> +static cl::opt<bool>
> +DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
> +  cl::init(false), cl::Hidden,
> +  cl::desc("When debugging is enabled, output information on the"
> +           " candidate-selection process"));
> +static cl::opt<bool>
> +DebugPairSelection("bb-vectorize-debug-pair-selection",
> +  cl::init(false), cl::Hidden,
> +  cl::desc("When debugging is enabled, output information on the"
> +           " pair-selection process"));
> +static cl::opt<bool>
> +DebugCycleCheck("bb-vectorize-debug-cycle-check",
> +  cl::init(false), cl::Hidden,
> +  cl::desc("When debugging is enabled, output information on the"
> +           " cycle-checking process"));
> +#endif
> +
> +STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
> +
> +namespace {
> +  struct BBVectorize : public BasicBlockPass {
> +    static char ID; // Pass identification, replacement for typeid
> +    BBVectorize() : BasicBlockPass(ID) {
> +      initializeBBVectorizePass(*PassRegistry::getPassRegistry());
> +    }
> +
> +    typedef std::pair<Value *, Value *>  ValuePair;
> +    typedef std::pair<ValuePair, size_t>  ValuePairWithDepth;
> +    typedef std::pair<ValuePair, ValuePair>  VPPair; // A ValuePair pair
> +    typedef std::pair<std::multimap<Value *, Value *>::iterator,
> +              std::multimap<Value *, Value *>::iterator>  VPIteratorPair;
> +    typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
> +              std::multimap<ValuePair, ValuePair>::iterator>
> +                VPPIteratorPair;
> +
> +    AliasAnalysis *AA;
> +    ScalarEvolution *SE;
> +    TargetData *TD;
> +
> +    // FIXME: const correct?
> +
> +    bool vectorizePairs(BasicBlock&BB);
> +
> +    void getCandidatePairs(BasicBlock&BB,
> +                       std::multimap<Value *, Value *>  &CandidatePairs,
> +                       std::vector<Value *>  &PairableInsts);
> +
> +    void computeConnectedPairs(std::multimap<Value *, Value *>  &CandidatePairs,
> +                       std::vector<Value *>  &PairableInsts,
> +                       std::multimap<ValuePair, ValuePair>  &ConnectedPairs);
> +
> +    void buildDepMap(BasicBlock&BB,
> +                       std::multimap<Value *, Value *>  &CandidatePairs,
> +                       std::vector<Value *>  &PairableInsts,
> +                       DenseSet<ValuePair>  &PairableInstUsers);
> +
> +    void choosePairs(std::multimap<Value *, Value *>  &CandidatePairs,
> +                        std::vector<Value *>  &PairableInsts,
> +                        std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                        DenseSet<ValuePair>  &PairableInstUsers,
> +                        DenseMap<Value *, Value *>&  ChosenPairs);
> +
> +    void fuseChosenPairs(BasicBlock&BB,
> +                     std::vector<Value *>  &PairableInsts,
> +                     DenseMap<Value *, Value *>&  ChosenPairs);
> +
> +    bool isInstVectorizable(Instruction *I, bool&IsSimpleLoadStore);
> +
> +    bool areInstsCompatible(Instruction *I, Instruction *J,
> +                       bool IsSimpleLoadStore);
> +
> +    bool trackUsesOfI(DenseSet<Value *>  &Users,
> +                      AliasSetTracker&WriteSet, Instruction *I,
> +                      Instruction *J, bool UpdateUsers = true,
> +                      std::multimap<Value *, Value *>  *LoadMoveSet = 0);
> +
> +    void computePairsConnectedTo(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      ValuePair P);
> +
> +    bool pairsConflict(ValuePair P, ValuePair Q,
> +                 DenseSet<ValuePair>  &PairableInstUsers,
> +                 std::multimap<ValuePair, ValuePair>  *PairableInstUserMap = 0);
> +
> +    bool pairWillFormCycle(ValuePair P,
> +                       std::multimap<ValuePair, ValuePair>  &PairableInstUsers,
> +                       DenseSet<ValuePair>  &CurrentPairs);
> +
> +    void pruneTreeFor(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      DenseSet<ValuePair>  &PairableInstUsers,
> +                      std::multimap<ValuePair, ValuePair>  &PairableInstUserMap,
> +                      DenseMap<Value *, Value *>  &ChosenPairs,
> +                      DenseMap<ValuePair, size_t>  &Tree,
> +                      DenseSet<ValuePair>  &PrunedTree, ValuePair J,
> +                      bool UseCycleCheck);
> +
> +    void buildInitialTreeFor(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      DenseSet<ValuePair>  &PairableInstUsers,
> +                      DenseMap<Value *, Value *>  &ChosenPairs,
> +                      DenseMap<ValuePair, size_t>  &Tree, ValuePair J);
> +
> +    void findBestTreeFor(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      DenseSet<ValuePair>  &PairableInstUsers,
> +                      std::multimap<ValuePair, ValuePair>  &PairableInstUserMap,
> +                      DenseMap<Value *, Value *>  &ChosenPairs,
> +                      DenseSet<ValuePair>  &BestTree, size_t&BestMaxDepth,
> +                      size_t&BestEffSize, VPIteratorPair ChoiceRange,
> +                      bool UseCycleCheck);
> +
> +    Value *getReplacementPointerInput(LLVMContext&  Context, Instruction *I,
> +                     Instruction *J, unsigned o, bool&FlipMemInputs);
> +
> +    void fillNewShuffleMask(LLVMContext&  Context, Instruction *J,
> +                     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
> +                     unsigned IdxOffset, std::vector<Constant*>  &Mask);
> +
> +    Value *getReplacementShuffleMask(LLVMContext&  Context, Instruction *I,
> +                     Instruction *J);
> +
> +    Value *getReplacementInput(LLVMContext&  Context, Instruction *I,
> +                     Instruction *J, unsigned o, bool FlipMemInputs);
> +
> +    void getReplacementInputsForPair(LLVMContext&  Context, Instruction *I,
> +                     Instruction *J, SmallVector<Value *, 3>  &ReplacedOperands,
> +                     bool&FlipMemInputs);
> +
> +    void replaceOutputsOfPair(LLVMContext&  Context, Instruction *I,
> +                     Instruction *J, Instruction *K,
> +                     Instruction *&InsertionPt, Instruction *&K1,
> +                     Instruction *&K2, bool&FlipMemInputs);
> +
> +    void collectPairLoadMoveSet(BasicBlock&BB,
> +                     DenseMap<Value *, Value *>  &ChosenPairs,
> +                     std::multimap<Value *, Value *>  &LoadMoveSet,
> +                     Instruction *I);
> +
> +    void collectLoadMoveSet(BasicBlock&BB,
> +                     std::vector<Value *>  &PairableInsts,
> +                     DenseMap<Value *, Value *>  &ChosenPairs,
> +                     std::multimap<Value *, Value *>  &LoadMoveSet);
> +
> +    bool canMoveUsesOfIAfterJ(BasicBlock&BB,
> +                     std::multimap<Value *, Value *>  &LoadMoveSet,
> +                     Instruction *I, Instruction *J);
> +
> +    void moveUsesOfIAfterJ(BasicBlock&BB,
> +                     std::multimap<Value *, Value *>  &LoadMoveSet,
> +                     Instruction *&InsertionPt,
> +                     Instruction *I, Instruction *J);
> +
> +    virtual bool runOnBasicBlock(BasicBlock&BB) {
> +      AA =&getAnalysis<AliasAnalysis>();
> +      SE =&getAnalysis<ScalarEvolution>();
> +      TD = getAnalysisIfAvailable<TargetData>();
> +
> +      bool changed = false;
> +      // Iterate a sufficient number of times to merge types of size 1 bit,
> +      // then 2 bits, then 4, etc. up to half of the target vector width of the
> +      // target vector register.
> +      for (unsigned v = 2, n = 1; v<= VectorBits&&  (!MaxIter || n<= MaxIter);
> +           v *= 2, ++n) {
> +        DEBUG(dbgs()<<  "BBV: fusing loop #"<<  n<<
> +              " for "<<  BB.getName()<<  " in "<<
> +              BB.getParent()->getName()<<  "...\n");
> +        if (vectorizePairs(BB))
> +          changed = true;
> +        else
> +          break;
> +      }
> +
> +      DEBUG(dbgs()<<  "BBV: done!\n");
> +      return changed;
> +    }
> +
> +    virtual void getAnalysisUsage(AnalysisUsage&AU) const {

Does this pass mutate the CFG (i.e., modify terminator instructions)? I 
don't see where it does, so shouldn't AU.setPreservesCFG() be called here?

> +      BasicBlockPass::getAnalysisUsage(AU);
> +      AU.addRequired<AliasAnalysis>();
> +      AU.addRequired<ScalarEvolution>();
> +      AU.addPreserved<AliasAnalysis>();
> +      AU.addPreserved<ScalarEvolution>();
> +    }
> +
> +    // This returns the vector type that holds a pair of the provided type.
> +    // If the provided type is already a vector, then its length is doubled.
> +    static inline VectorType *getVecTypeForPair(Type *ElemTy) {
> +      if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
> +        unsigned numElem = VTy->getNumElements();
> +        return VectorType::get(ElemTy->getScalarType(), numElem*2);
> +      } else {

No else-after-return. 
http://llvm.org/docs/CodingStandards.html#hl_else_after_return

> +        return VectorType::get(ElemTy, 2);
> +      }
> +    }
> +
> +    // Returns the weight associated with the provided value. A chain of
> +    // candidate pairs has a length given by the sum of the weights of its
> +    // members (one weight per pair; the weight of each member of the pair
> +    // is assumed to be the same). This length is then compared to the
> +    // chain-length threshold to determine if a given chain is significant
> +    // enough to be vectorized. The length is also used in comparing
> +    // candidate chains where longer chains are considered to be better.
> +    // Note: when this function returns 0, the resulting instructions are
> +    // not actually fused.
> +    static inline size_t getDepthFactor(Value *V) {
> +      // InsertElement and ExtractElement have a depth factor of zero. This is
> +      // for two reasons: First, they cannot be usefully fused. Second, because
> +      // the pass generates a lot of these, they can confuse the simple metric
> +      // used to compare the trees in the next iteration. Thus, giving them a
> +      // weight of zero allows the pass to essentially ignore them in
> +      // subsequent iterations when looking for vectorization opportunities
> +      // while still tracking dependency chains that flow through those
> +      // instructions.
> +      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
> +        return 0;
> +
> +      return 1;
> +    }
> +
> +    // This determines the relative offset of two loads or stores, returning
> +    // true if the offset could be determined to be some constant value.
> +    // For example, if OffsetInElmts == 1, then J accesses the memory directly
> +    // after I; if OffsetInElmts == -1 then I accesses the memory
> +    // directly after J. This function assumes that both instructions
> +    // have the same type.
> +    bool getPairPtrInfo(Instruction *I, Instruction *J,
> +        Value *&IPtr, Value *&JPtr, unsigned&IAlignment, unsigned&JAlignment,
> +        int64_t&OffsetInElmts) {
> +      OffsetInElmts = 0;
> +      if (isa<LoadInst>(I)) {
> +        IPtr = cast<LoadInst>(I)->getPointerOperand();
> +        JPtr = cast<LoadInst>(J)->getPointerOperand();
> +        IAlignment = cast<LoadInst>(I)->getAlignment();
> +        JAlignment = cast<LoadInst>(J)->getAlignment();
> +      } else {
> +        IPtr = cast<StoreInst>(I)->getPointerOperand();
> +        JPtr = cast<StoreInst>(J)->getPointerOperand();
> +        IAlignment = cast<StoreInst>(I)->getAlignment();
> +        JAlignment = cast<StoreInst>(J)->getAlignment();
> +      }
> +
> +      const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
> +      const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
> +
> +      // If this is a trivial offset, then we'll get something like
> +      // 1*sizeof(type). With target data, which we need anyway, this will get
> +      // constant folded into a number.
> +      const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
> +      if (const SCEVConstant *ConstOffSCEV =
> +            dyn_cast<SCEVConstant>(OffsetSCEV)) {
> +        ConstantInt *IntOff = ConstOffSCEV->getValue();
> +        int64_t Offset = IntOff->getSExtValue();
> +
> +        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
> +        int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
> +
> +        assert(VTy == cast<PointerType>(JPtr->getType())->getElementType());
> +
> +        OffsetInElmts = Offset/VTyTSS;
> +        return (abs64(Offset) % VTyTSS) == 0;
> +      }
> +
> +      return false;
> +    }
> +
> +    // Returns true if the provided CallInst represents an intrinsic that can
> +    // be vectorized.
> +    bool isVectorizableIntrinsic(CallInst* I) {
> +      Function *F = I->getCalledFunction();
> +      if (!F) return false;
> +
> +      unsigned IID = F->getIntrinsicID();
> +      if (!IID) return false;
> +
> +      switch(IID) {
> +      default:
> +        return false;
> +      case Intrinsic::sqrt:
> +      case Intrinsic::powi:
> +      case Intrinsic::sin:
> +      case Intrinsic::cos:
> +      case Intrinsic::log:
> +      case Intrinsic::log2:
> +      case Intrinsic::log10:
> +      case Intrinsic::exp:
> +      case Intrinsic::exp2:
> +      case Intrinsic::pow:
> +        return !NoMath;
> +      case Intrinsic::fma:
> +        return !NoFMA;
> +      }
> +    }
> +
> +    // Returns true if J is the second element in some pair referenced by
> +    // some multimap pair iterator pair.
> +    template<typename V>
> +    bool isSecondInIteratorPair(V J, std::pair<
> +           typename std::multimap<V, V>::iterator,
> +           typename std::multimap<V, V>::iterator>  PairRange) {
> +      for (typename std::multimap<V, V>::iterator K = PairRange.first;
> +           K != PairRange.second; ++K)
> +        if (K->second == J) return true;
> +
> +      return false;
> +    }
> +  };
> +
> +  // This function implements one vectorization iteration on the provided
> +  // basic block. It returns true if the block is changed.
> +  bool BBVectorize::vectorizePairs(BasicBlock&BB) {
> +    std::vector<Value *>  PairableInsts;
> +    std::multimap<Value *, Value *>  CandidatePairs;
> +    getCandidatePairs(BB, CandidatePairs, PairableInsts);
> +    if (PairableInsts.size() == 0) return false;
> +
> +    // Now we have a map of all of the pairable instructions and we need to
> +    // select the best possible pairing. A good pairing is one such that the
> +    // users of the pair are also paired. This defines a (directed) forest
> +    // over the pairs such that two pairs are connected iff the second pair
> +    // uses the first.
> +
> +    // Note that it only matters that both members of the second pair use some
> +    // element of the first pair (to allow for splatting).
> +
> +    std::multimap<ValuePair, ValuePair>  ConnectedPairs;
> +    computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
> +    if (ConnectedPairs.size() == 0) return false;

ConnectedPairs.empty()

> +
> +    // Build the pairable-instruction dependency map
> +    DenseSet<ValuePair>  PairableInstUsers;
> +    buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
> +
> +    // There is now a graph of the connected pairs. For each variable, pick the
> +    // pairing with the largest tree meeting the depth requirement on at least
> +    // one branch. Then select all pairings that are part of that tree and
> +    // remove them from the list of available pairings and pairable variables.
> +
> +    DenseMap<Value *, Value *>  ChosenPairs;
> +    choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
> +      PairableInstUsers, ChosenPairs);
> +
> +    if (ChosenPairs.size() == 0) return false;

ChosenPairs.empty()

> +    NumFusedOps += ChosenPairs.size();
> +
> +    // A set of pairs has now been selected. It is now necessary to replace the
> +    // paired instructions with vector instructions. For this procedure each
> +    // operand much be replaced with a vector operand. This vector is formed
> +    // by using build_vector on the old operands. The replaced values are then
> +    // replaced with a vector_extract on the result.  Subsequent optimization
> +    // passes should coalesce the build/extract combinations.
> +
> +    fuseChosenPairs(BB, PairableInsts, ChosenPairs);
> +
> +    return true;
> +  }
> +
> +  // This function returns true if the provided instruction is capable of being
> +  // fused into a vector instruction. This determination is based only on the
> +  // type and other attributes of the instruction.
> +  bool BBVectorize::isInstVectorizable(Instruction *I,
> +                                         bool&IsSimpleLoadStore) {
> +    IsSimpleLoadStore = false;
> +
> +    if (CallInst *C = dyn_cast<CallInst>(I)) {
> +      if (!isVectorizableIntrinsic(C))
> +        return false;
> +    } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
> +      // Vectorize simple loads if possbile:
> +      IsSimpleLoadStore = L->isSimple();
> +      if (!IsSimpleLoadStore || NoMemOps)
> +        return false;
> +    } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
> +      // Vectorize simple stores if possbile:
> +      IsSimpleLoadStore = S->isSimple();
> +      if (!IsSimpleLoadStore || NoMemOps)
> +        return false;
> +    } else if (CastInst *C = dyn_cast<CastInst>(I)) {
> +      // We can vectorize casts, but not casts of pointer types, etc.
> +      if (NoCasts)
> +        return false;
> +
> +      Type *SrcTy = C->getSrcTy();
> +      if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy())
> +        return false;
> +
> +      Type *DestTy = C->getDestTy();
> +      if (!DestTy->isSingleValueType() || DestTy->isPointerTy())
> +        return false;
> +    } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
> +        isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
> +      return false;
> +    }
> +
> +    // We can't vectorize memory operations without target data
> +    if (TD == 0&&  IsSimpleLoadStore)
> +      return false;
> +
> +    Type *T1, *T2;
> +    if (isa<StoreInst>(I)) {
> +      // For stores, it is the value type, not the pointer type that matters
> +      // because the value is what will come from a vector register.
> +
> +      Value *IVal = cast<StoreInst>(I)->getValueOperand();
> +      T1 = IVal->getType();
> +    } else {
> +      T1 = I->getType();
> +    }
> +
> +    if (I->isCast())
> +      T2 = cast<CastInst>(I)->getSrcTy();
> +    else
> +      T2 = T1;
> +
> +    // Not every type can be vectorized...
> +    if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
> +        !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
> +      return false;
> +
> +    if (NoInts&&  (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
> +      return false;
> +
> +    if (NoFloats&&  (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
> +      return false;
> +
> +    if (T1->getPrimitiveSizeInBits()>  VectorBits/2 ||
> +        T2->getPrimitiveSizeInBits()>  VectorBits/2)
> +      return false;
> +
> +    return true;
> +  }
> +
> +  // This function returns true if the two provided instructions are compatible
> +  // (meaning that they can be fused into a vector instruction). This assumes
> +  // that I has already been determined to be vectorizable and that J is not
> +  // in the use tree of I.
> +  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
> +                       bool IsSimpleLoadStore) {
> +    DEBUG(if (DebugInstructionExamination) dbgs()<<  "BBV: looking at "<<  *I<<
> +                     "<->  "<<  *J<<  "\n");
> +
> +    // Loads and stores can be merged if they have different alignments,
> +    // but are otherwise the same.
> +    LoadInst *LI, *LJ;
> +    StoreInst *SI, *SJ;
> +    if ((LI = dyn_cast<LoadInst>(I))&&  (LJ = dyn_cast<LoadInst>(J))) {
> +      if (I->getType() != J->getType())
> +        return false;
> +
> +      if (LI->getPointerOperand()->getType() !=
> +            LJ->getPointerOperand()->getType() ||
> +          LI->isVolatile() != LJ->isVolatile() ||

You don't combine two separate volatile loads, do you? That sounds bad.

I'm also not sure about merging two atomic load/stores...

> +          LI->getOrdering() != LJ->getOrdering() ||
> +          LI->getSynchScope() != LJ->getSynchScope())
> +        return false;
> +    } else if ((SI = dyn_cast<StoreInst>(I))&&  (SJ = dyn_cast<StoreInst>(J))) {
> +      if (SI->getValueOperand()->getType() !=
> +            SJ->getValueOperand()->getType() ||
> +          SI->getPointerOperand()->getType() !=
> +            SJ->getPointerOperand()->getType() ||
> +          SI->isVolatile() != SJ->isVolatile() ||
> +          SI->getOrdering() != SJ->getOrdering() ||
> +          SI->getSynchScope() != SJ->getSynchScope())
> +        return false;
> +    } else if (!J->isSameOperationAs(I)) {
> +      return false;
> +    }
> +    // FIXME: handle addsub-type operations!
> +
> +    if (IsSimpleLoadStore) {
> +      Value *IPtr, *JPtr;
> +      unsigned IAlignment, JAlignment;
> +      int64_t OffsetInElmts = 0;
> +      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
> +            OffsetInElmts)&&  abs64(OffsetInElmts) == 1) {
> +        if (AlignedOnly) {
> +          Type *aType = isa<StoreInst>(I) ?
> +            cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
> +          // An aligned load or store is possible only if the instruction
> +          // with the lower offset has an alignment suitable for the
> +          // vector type.
> +
> +          unsigned BottomAlignment = IAlignment;
> +          if (OffsetInElmts<  0) BottomAlignment = JAlignment;
> +
> +          Type *VType = getVecTypeForPair(aType);
> +          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
> +          if (BottomAlignment<  VecAlignment)
> +            return false;
> +        }
> +      } else {
> +        return false;
> +      }
> +    } else if (isa<ShuffleVectorInst>(I)) {
> +      // Only merge two shuffles if they're both constant
> +      return isa<Constant>(I->getOperand(2))&&
> +             isa<Constant>(J->getOperand(2));
> +      // FIXME: We may want to vectorize non-constant shuffles also.
> +    }
> +
> +    return true;
> +  }
> +
> +  // Figure out whether or not J uses I and update the users and write-set
> +  // structures associated with I. Specifically, Users represents the set of
> +  // instructions that depend on I. WriteSet represents the set
> +  // of memory locations that are dependent on I. If UpdateUsers is true,
> +  // and J uses I, then Users is updated to contain J and WriteSet is updated
> +  // to contain any memory locations to which J writes. The function returns
> +  // true if J uses I. By default, alias analysis is used to determine
> +  // whether J reads from memory that overlaps with a location in WriteSet.
> +  // If LoadMoveSet is not null, then it is a previously-computed multimap
> +  // where the key is the memory-based user instruction and the value is
> +  // the instruction to be compared with I. So, if LoadMoveSet is provided,
> +  // then the alias analysis is not used. This is necessary because this
> +  // function is called during the process of moving instructions during
> +  // vectorization and the results of the alias analysis are not stable during
> +  // that process.
> +  bool BBVectorize::trackUsesOfI(DenseSet<Value *>  &Users,
> +                       AliasSetTracker&WriteSet, Instruction *I,
> +                       Instruction *J, bool UpdateUsers,
> +                       std::multimap<Value *, Value *>  *LoadMoveSet) {
> +    bool UsesI = false;
> +
> +    // This instruction may already be marked as a user due, for example, to
> +    // being a member of a selected pair.
> +    if (Users.count(J))
> +      UsesI = true;
> +
> +    if (!UsesI)
> +      for (User::op_iterator JU = J->op_begin(), e = J->op_end();
> +           JU != e; ++JU) {

This is correct, but the usual convention is to name the end iterator to
match the begin iterator ("JU = ..., JE = ...") for consistency.

> +        Value *V = *JU;
> +        if (I == V || Users.count(V)) {
> +          UsesI = true;
> +          break;
> +        }
> +      }
> +    if (!UsesI&&  J->mayReadFromMemory()) {
> +      if (LoadMoveSet) {
> +        VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
> +        UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
> +      } else {
> +        for (AliasSetTracker::iterator W = WriteSet.begin(),
> +             WE = WriteSet.end(); W != WE; ++W) {
> +          for (AliasSet::iterator A = W->begin(), AE = W->end();
> +               A != AE; ++A) {
> +            AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(),
> +                                           A->getTBAAInfo());
> +            if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) {
> +              UsesI = true;
> +              break;
> +            }
> +          }
> +          if (UsesI) break;
> +        }
> +      }
> +    }
> +
> +    if (UsesI&&  UpdateUsers) {
> +      if (J->mayWriteToMemory()) WriteSet.add(J);
> +      Users.insert(J);
> +    }
> +
> +    return UsesI;
> +  }
> +
> +  // This function iterates over all instruction pairs in the provided
> +  // basic block and collects all candidate pairs for vectorization.
> +  void BBVectorize::getCandidatePairs(BasicBlock&BB,
> +                       std::multimap<Value *, Value *>  &CandidatePairs,
> +                       std::vector<Value *>  &PairableInsts) {
> +    BasicBlock::iterator E = BB.end();
> +    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
> +      bool IsSimpleLoadStore;
> +      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
> +
> +      // Look for an instruction with which to pair instruction *I...
> +      DenseSet<Value *>  Users;
> +      AliasSetTracker WriteSet(*AA);
> +      BasicBlock::iterator J = I; ++J;
> +      for (unsigned ss = 0; J != E&&  ss<= SearchLimit; ++J, ++ss) {
> +        // Determine if J uses I, if so, exit the loop.
> +        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
> +        if (FastDep) {
> +          // Note: For this heuristic to be effective, independent operations
> +          // must tend to be intermixed. This is likely to be true from some
> +          // kinds of grouped loop unrolling (but not the generic LLVM pass),
> +          // but otherwise may require some kind of reordering pass.
> +
> +          // When using fast dependency analysis,
> +          // stop searching after first use:
> +          if (UsesI) break;
> +        } else {
> +          if (UsesI) continue;
> +        }
> +
> +        // J does not use I, and comes before the first use of I, so it can be
> +        // merged with I if the instructions are compatible.
> +        if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
> +
> +        // J is a candidate for merging with I.
> +        if (!PairableInsts.size() ||
> +             PairableInsts[PairableInsts.size()-1] != I) {
> +          PairableInsts.push_back(I);
> +        }
> +        CandidatePairs.insert(ValuePair(I, J));
> +        DEBUG(if (DebugCandidateSelection) dbgs()<<  "BBV: candidate pair"
> +<<  *I<<  "<->  "<<  *J<<  "\n");
> +      }
> +    }
> +
> +    DEBUG(dbgs()<<  "BBV: found "<<  PairableInsts.size()
> +<<  " instructions with candidate pairs\n");
> +  }
> +
> +  // Finds candidate pairs connected to the pair P =<PI, PJ>. This means that
> +  // it looks for pairs such that both members have an input which is an
> +  // output of PI or PJ.
> +  void BBVectorize::computePairsConnectedTo(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      ValuePair P) {
> +    // For each possible pairing for this variable, look at the uses of
> +    // the first value...
> +    for (Value::use_iterator I = P.first->use_begin(),
> +         E = P.first->use_end(); I != E; ++I) {
> +      VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
> +
> +      // For each use of the first variable, look for uses of the second
> +      // variable...
> +      for (Value::use_iterator J = P.second->use_begin(),
> +           E2 = P.second->use_end(); J != E2; ++J) {
> +        VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
> +
> +        // Look for<I, J>:
> +        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
> +          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
> +
> +        // Look for<J, I>:
> +        if (isSecondInIteratorPair<Value*>(*I, JPairRange))
> +          ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
> +      }
> +
> +      if (SplatBreaksChain) continue;
> +      // Look for cases where just the first value in the pair is used by
> +      // both members of another pair (splatting).
> +      for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
> +        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
> +          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
> +      }
> +    }
> +
> +    if (SplatBreaksChain) return;
> +    // Look for cases where just the second value in the pair is used by
> +    // both members of another pair (splatting).
> +    for (Value::use_iterator I = P.second->use_begin(),
> +         E = P.second->use_end(); I != E; ++I) {
> +      VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
> +
> +      for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
> +        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
> +          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
> +      }
> +    }
> +  }
> +
> +  // This function figures out which pairs are connected.  Two pairs are
> +  // connected if some output of the first pair forms an input to both members
> +  // of the second pair.
> +  void BBVectorize::computeConnectedPairs(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs) {
> +
> +    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
> +         PE = PairableInsts.end(); PI != PE; ++PI) {
> +      VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
> +
> +      for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
> +           P != choiceRange.second; ++P)
> +        computePairsConnectedTo(CandidatePairs, PairableInsts,
> +                                ConnectedPairs, *P);
> +    }
> +
> +    DEBUG(dbgs()<<  "BBV: found "<<  ConnectedPairs.size()
> +<<  " pair connections.\n");
> +  }
> +
> +  // This function builds a set of use tuples such that<A, B>  is in the set
> +  // if B is in the use tree of A. If B is in the use tree of A, then B
> +  // depends on the output of A.
> +  void BBVectorize::buildDepMap(
> +                      BasicBlock&BB,
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      DenseSet<ValuePair>  &PairableInstUsers) {
> +    DenseSet<Value *>  IsInPair;
> +    for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(),
> +         E = CandidatePairs.end(); C != E; ++C) {
> +      IsInPair.insert(C->first);
> +      IsInPair.insert(C->second);
> +    }
> +
> +    // Iterate through the basic block, recording all Users of each
> +    // pairable instruction.
> +
> +    BasicBlock::iterator E = BB.end();
> +    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {

"for (...; !isa<TerminatorInst>(I); ++I) {" should also work, and avoid 
the need to declare 'E' above.

> +      if (IsInPair.find(I) == IsInPair.end()) continue;
> +
> +      DenseSet<Value *>  Users;
> +      AliasSetTracker WriteSet(*AA);
> +      for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
> +        (void) trackUsesOfI(Users, WriteSet, I, J);
> +
> +      for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
> +           U != E; ++U)
> +        PairableInstUsers.insert(ValuePair(I, *U));
> +    }
> +  }
> +
> +  // Returns true if an input to pair P is an output of pair Q and also an
> +  // input of pair Q is an output of pair P. If this is the case, then these
> +  // two pairs cannot be simultaneously fused.
> +  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
> +                     DenseSet<ValuePair>  &PairableInstUsers,
> +                     std::multimap<ValuePair, ValuePair>  *PairableInstUserMap) {
> +    // Two pairs are in conflict if they are mutual Users of each other.
> +    bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
> +                  PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
> +                  PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
> +                  PairableInstUsers.count(ValuePair(P.second, Q.second));
> +    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
> +                  PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
> +                  PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
> +                  PairableInstUsers.count(ValuePair(Q.second, P.second));
> +    if (PairableInstUserMap) {
> +      // FIXME: The expensive part of the cycle check is not so much the cycle
> +      // check itself but this edge insertion procedure. This needs some
> +      // profiling and probably a different data structure (same is true of
> +      // most uses of std::multimap).
> +      if (PUsesQ) {
> +        VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
> +        if (!isSecondInIteratorPair(P, QPairRange))
> +          PairableInstUserMap->insert(VPPair(Q, P));
> +      }
> +      if (QUsesP) {
> +        VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
> +        if (!isSecondInIteratorPair(Q, PPairRange))
> +          PairableInstUserMap->insert(VPPair(P, Q));
> +      }
> +    }
> +
> +    return (QUsesP&&  PUsesQ);
> +  }
> +
> +  // This function walks the use graph of current pairs to see if, starting
> +  // from P, the walk returns to P.
> +  bool BBVectorize::pairWillFormCycle(ValuePair P,
> +                       std::multimap<ValuePair, ValuePair>  &PairableInstUserMap,
> +                       DenseSet<ValuePair>  &CurrentPairs) {
> +    DEBUG(if (DebugCycleCheck)
> +            dbgs()<<  "BBV: starting cycle check for : "<<  *P.first<<  "<->  "
> +<<  *P.second<<  "\n");
> +    // A lookup table of visited pairs is kept because the PairableInstUserMap
> +    // contains non-direct associations.
> +    DenseSet<ValuePair>  Visited;
> +    std::vector<ValuePair>  Q;
> +    // General depth-first post-order traversal:
> +    Q.push_back(P);
> +    while (!Q.empty()) {

This condition is always true on the first iteration, because P has just
been pushed onto Q. Please rotate this into a do/while loop:

   SmallVector<ValuePair, 32> Q;
   Q.push_back(P);
   do {
     ValuePair QTop = Q.pop_back_val();
     Visited.insert(QTop);
     // ...
   } while(!Q.empty());

> +      ValuePair QTop = Q.back();
> +
> +      Visited.insert(QTop);
> +      Q.pop_back();
> +
> +      DEBUG(if (DebugCycleCheck)
> +              dbgs()<<  "BBV: cycle check visiting: "<<  *QTop.first<<  "<->  "
> +<<  *QTop.second<<  "\n");
> +      VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
> +      for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first;
> +           C != QPairRange.second; ++C) {
> +        if (C->second == P) {
> +          DEBUG(dbgs()
> +<<  "BBV: rejected to prevent non-trivial cycle formation:"
> +<<  *C->first.first<<  "<->  "<<  *C->first.second<<  "\n");
> +          return true;
> +        }
> +
> +        if (CurrentPairs.count(C->second)>  0&&
> +            Visited.count(C->second) == 0)
> +          Q.push_back(C->second);
> +      }
> +    }
> +
> +    return false;
> +  }
> +
> +  // This function builds the initial tree of connected pairs with the
> +  // pair J at the root.
> +  void BBVectorize::buildInitialTreeFor(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      DenseSet<ValuePair>  &PairableInstUsers,
> +                      DenseMap<Value *, Value *>  &ChosenPairs,
> +                      DenseMap<ValuePair, size_t>  &Tree, ValuePair J) {
> +    // Each of these pairs is viewed as the root node of a Tree. The Tree
> +    // is then walked (depth-first). As this happens, we keep track of
> +    // the pairs that compose the Tree and the maximum depth of the Tree.
> +    std::vector<ValuePairWithDepth>  Q;
> +    // General depth-first post-order traversal:
> +    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
> +    while (!Q.empty()) {
> +      ValuePairWithDepth QTop = Q.back();

This loop can be rotated too, though you may not want to switch to using 
pop_back_val() here (I see that you do additional pushes and optional 
pops in the loop).

> +
> +      // Push each child onto the queue:
> +      bool MoreChildren = false;
> +      size_t MaxChildDepth = QTop.second;
> +      VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
> +      for (std::map<ValuePair, ValuePair>::iterator k = qtRange.first;
> +           k != qtRange.second; ++k) {
> +        // Make sure that this child pair is still a candidate:
> +        bool IsStillCand = false;
> +        VPIteratorPair checkRange =
> +          CandidatePairs.equal_range(k->second.first);
> +        for (std::multimap<Value *, Value *>::iterator m = checkRange.first;
> +             m != checkRange.second; ++m) {
> +          if (m->second == k->second.second) {
> +            IsStillCand = true;
> +            break;
> +          }
> +        }
> +
> +        if (IsStillCand) {
> +          DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second);
> +          if (C == Tree.end()) {
> +            size_t d = getDepthFactor(k->second.first);
> +            Q.push_back(ValuePairWithDepth(k->second, QTop.second+d));
> +            MoreChildren = true;
> +          } else {
> +            MaxChildDepth = std::max(MaxChildDepth, C->second);
> +          }
> +        }
> +      }
> +
> +      if (!MoreChildren) {
> +        // Record the current pair as part of the Tree:
> +        Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
> +        Q.pop_back();
> +      }
> +    }
> +  }
> +
> +  // Given some initial tree, prune it by removing conflicting pairs (pairs
> +  // that cannot be simultaneously chosen for vectorization).
> +  void BBVectorize::pruneTreeFor(
> +                      std::multimap<Value *, Value *>  &CandidatePairs,
> +                      std::vector<Value *>  &PairableInsts,
> +                      std::multimap<ValuePair, ValuePair>  &ConnectedPairs,
> +                      DenseSet<ValuePair>  &PairableInstUsers,
> +                      std::multimap<ValuePair, ValuePair>  &PairableInstUserMap,
> +                      DenseMap<Value *, Value *>  &ChosenPairs,
> +                      DenseMap<ValuePair, size_t>  &Tree,
> +                      DenseSet<ValuePair>  &PrunedTree, ValuePair J,
> +                      bool UseCycleCheck) {
> +    std::vector<ValuePairWithDepth>  Q;
> +    // General depth-first post-order traversal:
> +    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
> +    while (!Q.empty()) {
> +      ValuePairWithDepth QTop = Q.back();
> +      PrunedTree.insert(QTop.first);
> +      Q.pop_back();

Another loop to restructure.

(Stopped reviewing at this point.)

Nick



More information about the llvm-commits mailing list