[llvm-commits] [llvm] r171469 - in /llvm/trunk: include/llvm/Target/TargetTransformImpl.h include/llvm/TargetTransformInfo.h lib/Target/TargetTransformImpl.cpp lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h lib/Transforms/Vectorize/LoopVectorize.cpp lib/Transforms/Vectorize/LoopVectorize.h test/Transforms/LoopVectorize/X86/gcc-examples.ll test/Transforms/LoopVectorize/gcc-examples.ll

Fri Jan 4 10:15:48 PST 2013

----- Original Message -----
> From: "Nadav Rotem" <nrotem at apple.com>
> To: llvm-commits at cs.uiuc.edu
> Sent: Friday, January 4, 2013 11:48:26 AM
> Subject: [llvm-commits] [llvm] r171469 - in /llvm/trunk: include/llvm/Target/TargetTransformImpl.h
> include/llvm/TargetTransformInfo.h lib/Target/TargetTransformImpl.cpp lib/Target/X86/X86ISelLowering.cpp
> lib/Target/X86/X86ISelLowering.h lib/Transforms/Vectorize/LoopVectorize.cpp lib/Transforms/Vectorize/LoopVectorize.h
> test/Transforms/LoopVectorize/X86/gcc-examples.ll test/Transforms/LoopVectorize/gcc-examples.ll
> 
> Author: nadav
> Date: Fri Jan  4 11:48:25 2013
> New Revision: 171469
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=171469&view=rev
> Log:
> LoopVectorizer:
> 
> 1. Add code to estimate register pressure.
> 2. Add code to select the unroll factor based on register pressure.
> 3. Add bits to TargetTransformInfo to provide the number of
> registers.
> 
> 
> Modified:
>     llvm/trunk/include/llvm/Target/TargetTransformImpl.h
>     llvm/trunk/include/llvm/TargetTransformInfo.h
>     llvm/trunk/lib/Target/TargetTransformImpl.cpp
>     llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>     llvm/trunk/lib/Target/X86/X86ISelLowering.h
>     llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>     llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h
>     llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll
>     llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll
> 
> Modified: llvm/trunk/include/llvm/Target/TargetTransformImpl.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetTransformImpl.h?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/Target/TargetTransformImpl.h (original)
> +++ llvm/trunk/include/llvm/Target/TargetTransformImpl.h Fri Jan  4
> 11:48:25 2013
> @@ -69,6 +69,8 @@
>  
>    virtual ~VectorTargetTransformImpl() {}
>  
> +  virtual unsigned getNumberOfRegisters(bool Vector) const;
> +
>    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty)
>    const;
>  
>    virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
> 
> Modified: llvm/trunk/include/llvm/TargetTransformInfo.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/TargetTransformInfo.h?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/TargetTransformInfo.h (original)
> +++ llvm/trunk/include/llvm/TargetTransformInfo.h Fri Jan  4 11:48:25
> 2013
> @@ -164,12 +164,19 @@
>      ExtractSubvector // ExtractSubvector Index indicates start
>      offset.
>    };
>  
> -  /// Returns the expected cost of arithmetic ops, such as mul, xor,
> fsub, etc.
> +  /// \return The number of scalar or vector registers that the
> target has.
> +  /// If 'Vectors' is true, it returns the number of vector
> registers. If it is
> +  /// set to false, it returns the number of scalar registers.
> +  virtual unsigned getNumberOfRegisters(bool Vector) const {
> +    return 8;
> +  }
> +
> +  /// \return The expected cost of arithmetic ops, such as mul, xor,
> fsub, etc.
>    virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty)
>    const {
>      return 1;
>    }
>  
> -  /// Returns the cost of a shuffle instruction of kind Kind and of
> type Tp.
> +  /// \return The cost of a shuffle instruction of kind Kind and of
> type Tp.
>    /// The index and subtype parameters are used by the subvector
>    insertion and
>    /// extraction shuffle kinds.
>    virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
> @@ -177,47 +184,47 @@
>      return 1;
>    }
>  
> -  /// Returns the expected cost of cast instructions, such as
> bitcast, trunc,
> +  /// \return The expected cost of cast instructions, such as
> bitcast, trunc,
>    /// zext, etc.
>    virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
>                                      Type *Src) const {
>      return 1;
>    }
>  
> -  /// Returns the expected cost of control-flow related
> instrutctions such as
> +  /// \return The expected cost of control-flow related
> instrutctions such as
>    /// Phi, Ret, Br.
>    virtual unsigned getCFInstrCost(unsigned Opcode) const {
>      return 1;
>    }
>  
> -  /// Returns the expected cost of compare and select instructions.
> +  /// \returns The expected cost of compare and select instructions.
>    virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
>                                        Type *CondTy = 0) const {
>      return 1;
>    }
>  
> -  /// Returns the expected cost of vector Insert and Extract.
> +  /// \return The expected cost of vector Insert and Extract.
>    /// Use -1 to indicate that there is no information on the index
>    value.
>    virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
>                                        unsigned Index = -1) const {
>      return 1;
>    }
>  
> -  /// Returns the cost of Load and Store instructions.
> +  /// \return The cost of Load and Store instructions.
>    virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
>                                     unsigned Alignment,
>                                     unsigned AddressSpace) const {
>      return 1;
>    }
>  
> -  /// Returns the cost of Intrinsic instructions.
> +  /// \returns The cost of Intrinsic instructions.
>    virtual unsigned getIntrinsicInstrCost(Intrinsic::ID,
>                                           Type *RetTy,
>                                           ArrayRef<Type*> Tys) const
>                                           {
>      return 1;
>    }
>  
> -  /// Returns the number of pieces into which the provided type must
> be
> +  /// \returns The number of pieces into which the provided type
> must be
>    /// split during legalization. Zero is returned when the answer is
>    unknown.
>    virtual unsigned getNumberOfParts(Type *Tp) const {
>      return 0;
> 
> Modified: llvm/trunk/lib/Target/TargetTransformImpl.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/TargetTransformImpl.cpp?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/TargetTransformImpl.cpp (original)
> +++ llvm/trunk/lib/Target/TargetTransformImpl.cpp Fri Jan  4 11:48:25
> 2013
> @@ -171,6 +171,10 @@
>    return Cost;
>  }
>  
> +unsigned VectorTargetTransformImpl::getNumberOfRegisters(bool
> Vector) const {
> +  return 8;
> +}
> +
>  unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned
>  Opcode,
>                                                             Type *Ty)
>                                                             const {
>    // Check if any of the operands are vector operands.
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jan  4 11:48:25
> 2013
> @@ -18115,6 +18115,13 @@
>    return ST.hasSSE41() ? Fast : None;
>  }
>  
> +unsigned X86VectorTargetTransformInfo::getNumberOfRegisters(bool
> Vector) const {
> +  const X86Subtarget &ST =
> TLI->getTargetMachine().getSubtarget<X86Subtarget>();
> +  if (ST.is64Bit())
> +    return 16;
> +  return 8;
> +}

Can't we get this from the generic TLI code? As I recall, you tried to get register information into VTTI at one point, but had to back it out because of shared-library linking problems. We should really investigate that again.

> +
>  unsigned
>  X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned
>  Opcode,
>                                                       Type *Ty) const
>                                                       {
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Fri Jan  4 11:48:25
> 2013
> @@ -959,6 +959,8 @@
>      explicit X86VectorTargetTransformInfo(const TargetLowering *TL)
>      :
>      VectorTargetTransformImpl(TL) {}
>  
> +    virtual unsigned getNumberOfRegisters(bool Vector) const;
> +
>      virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type
>      *Ty) const;
>  
>      virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
> 
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Fri Jan  4
> 11:48:25 2013
> @@ -7,6 +7,7 @@
>  //
>  //===----------------------------------------------------------------------===//
>  #include "LoopVectorize.h"
> +#include "llvm/ADT/SmallSet.h"
>  #include "llvm/ADT/StringExtras.h"
>  #include "llvm/Analysis/AliasAnalysis.h"
>  #include "llvm/Analysis/AliasSetTracker.h"
> @@ -43,7 +44,7 @@
>                      cl::desc("Sets the SIMD width. Zero is
>                      autoselect."));
>  
>  static cl::opt<unsigned>
> -VectorizationUnroll("force-vector-unroll", cl::init(1), cl::Hidden,
> +VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
>                      cl::desc("Sets the vectorization unroll count. "
>                               "Zero is autoselect."));
>  
> @@ -94,7 +95,7 @@
>      if (TTI)
>        VTTI = TTI->getVectorTargetTransformInfo();
>      // Use the cost model.
> -    LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
> +    LoopVectorizationCostModel CM(L, SE, LI, &LVL, VTTI);
>  
>      // Check the function attribues to find out if this function
>      should be
>      // optimized for size.
> @@ -112,6 +113,7 @@
>      }
>  
>      unsigned VF = CM.selectVectorizationFactor(OptForSize,
>      VectorizationFactor);
> +    unsigned UF = CM.selectUnrollFactor(OptForSize,
> VectorizationUnroll);
>  
>      if (VF == 1) {
>        DEBUG(dbgs() << "LV: Vectorization is possible but not
>        beneficial.\n");
> @@ -120,9 +122,10 @@
>  
>      DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in
>      "<<
>            F->getParent()->getModuleIdentifier()<<"\n");
> +    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
>  
>      // If we decided that it is *legal* to vectorizer the loop then
>      do it.
> -    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF,
> VectorizationUnroll);
> +    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
>      LB.vectorize(&LVL);
>  
>      DEBUG(verifyFunction(*L->getHeader()->getParent()));
> @@ -2082,7 +2085,7 @@
>  
>  unsigned
>  LoopVectorizationCostModel::selectVectorizationFactor(bool
>  OptForSize,
> -                                                        unsigned
> UserVF) {
> +                                                      unsigned
> UserVF) {
>    if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
>      DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in
>      Os.\n");
>      return 1;
> @@ -2148,6 +2151,161 @@
>    return Width;
>  }
>  
> +unsigned
> +LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
> +                                               unsigned UserUF) {
> +  // Use the user preference, unless 'auto' is selected.
> +  if (UserUF != 0)
> +    return UserUF;
> +
> +  // When we optimize for size we don't unroll.
> +  if (OptForSize)
> +    return 1;
> +
> +  unsigned TargetVectorRegisters = VTTI->getNumberOfRegisters(true);
> +  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
> +        " vector registers\n");
> +
> +  LoopVectorizationCostModel::RegisterUsage R =
> calculateRegisterUsage();
> +  // We divide by these constants so assume that we have at least
> one
> +  // instruction that uses at least one register.
> +  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
> +  R.NumInstructions = std::max(R.NumInstructions, 1U);
> +
> +  // We calculate the unroll factor using the following formula.
> +  // Subtract the number of loop invariants from the number of
> available
> +  // registers. These registers are used by all of the unrolled
> instances.
> +  // Next, divide the remaining registers by the number of registers
> that is
> +  // required by the loop, in order to estimate how many parallel
> instances
> +  // fit without causing spills.
> +  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) /
> R.MaxLocalUsers;
> +
> +  // We don't want to unroll the loops to the point where they do
> not fit into
> +  // the decoded cache. Assume that we only allow 32 IR
> instructions.
> +  UF = std::min(UF, (32 / R.NumInstructions));
> +
> +  // Clamp the unroll factor ranges to reasonable factors.
> +  if (UF > MaxUnrollSize)
> +    UF = MaxUnrollSize;
> +  else if (UF < 1)
> +    UF = 1;
> +
> +  return UF;
> +}
> +
> +LoopVectorizationCostModel::RegisterUsage
> +LoopVectorizationCostModel::calculateRegisterUsage() {
> +  // This function calculates the register usage by measuring the
> highest number
> +  // of values that are alive at a single location. Obviously, this
> is a very
> +  // rough estimation. We scan the loop in a topological order in
> order and
> +  // assign a number to each instruction. We use RPO to ensure that
> defs are
> +  // met before their users. We assume that each instruction that
> has in-loop
> +  // users starts an interval. We record every time that an in-loop
> value is
> +  // used, so we have a list of the first and last occurrences of
> each
> +  // instruction. Next, we transpose this data structure into a
> multi map that
> +  // holds the list of intervals that *end* at a specific location.
> This multi
> +  // map allows us to perform a linear search. We scan the
> instructions linearly
> +  // and record each time that a new interval starts, by placing it
> in a set.
> +  // If we find this value in the multi-map then we remove it from
> the set.
> +  // The max register usage is the maximum size of the set.
> +  // We also search for instructions that are defined outside the
> loop, but are
> +  // used inside the loop. We need this number separately from the
> max-interval
> +  // usage number because when we unroll, loop-invariant values do
> not take

This is great, but please move it elsewhere. I'd like to use this in the regular unroller (among other places). Also, we'll need to distinguish between register types (at least vector registers from scalar registers) -- I've taken only a quick look, but this does not seem to do that.

Thanks again,
Hal

> +  // more register.
> +  LoopBlocksDFS DFS(TheLoop);
> +  DFS.perform(LI);
> +
> +  RegisterUsage R;
> +  R.NumInstructions = 0;
> +
> +  // Each 'key' in the map opens a new interval. The values
> +  // of the map are the index of the 'last seen' usage of the
> +  // instruction that is the key.
> +  typedef DenseMap<Instruction*, unsigned> IntervalMap;
> +  // Maps instruction to its index.
> +  DenseMap<unsigned, Instruction*> IdxToInstr;
> +  // Marks the end of each interval.
> +  IntervalMap EndPoint;
> +  // Saves the list of instruction indices that are used in the
> loop.
> +  SmallSet<Instruction*, 8> Ends;
> +  // Saves the list of values that are used in the loop but are
> +  // defined outside the loop, such as arguments and constants.
> +  SmallPtrSet<Value*, 8> LoopInvariants;
> +
> +  unsigned Index = 0;
> +  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
> +       be = DFS.endRPO(); bb != be; ++bb) {
> +    R.NumInstructions += (*bb)->size();
> +    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end();
> it != e;
> +         ++it) {
> +      Instruction *I = it;
> +      IdxToInstr[Index++] = I;
> +
> +      // Save the end location of each USE.
> +      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
> +        Value *U = I->getOperand(i);
> +        Instruction *Instr = dyn_cast<Instruction>(U);
> +
> +        // Ignore non-instruction values such as arguments,
> constants, etc.
> +        if (!Instr) continue;
> +
> +        // If this instruction is outside the loop then record it
> and continue.
> +        if (!TheLoop->contains(Instr)) {
> +          LoopInvariants.insert(Instr);
> +          continue;
> +        }
> +
> +        // Overwrite previous end points.
> +        EndPoint[Instr] = Index;
> +        Ends.insert(Instr);
> +      }
> +    }
> +  }
> +
> +  // Saves the list of intervals that end with the index in 'key'.
> +  typedef SmallVector<Instruction*, 2> InstrList;
> +  DenseMap<unsigned, InstrList> TransposeEnds;
> +
> +  // Transpose the EndPoints to a list of values that end at each
> index.
> +  for (IntervalMap::iterator it = EndPoint.begin(), e =
> EndPoint.end();
> +       it != e; ++it)
> +    TransposeEnds[it->second].push_back(it->first);
> +
> +  SmallSet<Instruction*, 8> OpenIntervals;
> +  unsigned MaxUsage = 0;
> +
> +
> +  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
> +  for (unsigned int i = 0; i < Index; ++i) {
> +    Instruction *I = IdxToInstr[i];
> +    // Ignore instructions that are never used within the loop.
> +    if (!Ends.count(I)) continue;
> +
> +    // Remove all of the instructions that end at this location.
> +    InstrList &List = TransposeEnds[i];
> +    for (unsigned int i=0, e = List.size(); i < e; ++i)
> +      OpenIntervals.erase(List[i]);
> +
> +    // Count the number of live interals.
> +    MaxUsage = std::max(MaxUsage, OpenIntervals.size());
> +
> +    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
> +          OpenIntervals.size() <<"\n");
> +
> +    // Add the current instruction to the list of open intervals.
> +    OpenIntervals.insert(I);
> +  }
> +
> +  unsigned Invariant = LoopInvariants.size();
> +  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << "
> \n");
> +  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant <<
> " \n");
> +  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << "
> \n");
> +
> +  R.LoopInvariantRegs = Invariant;
> +  R.MaxLocalUsers = MaxUsage;
> +  return R;
> +}
> +
>  unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
>    unsigned Cost = 0;
>  
> 
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.h Fri Jan  4
> 11:48:25 2013
> @@ -68,6 +68,9 @@
>  /// This is the highest vector width that we try to generate.
>  const unsigned MaxVectorSize = 8;
>  
> +/// This is the highest Unroll Factor.
> +const unsigned MaxUnrollSize = 4;
> +
>  namespace llvm {
>  
>  // Forward declarations.
> @@ -473,17 +476,37 @@
>  class LoopVectorizationCostModel {
>  public:
>    /// C'tor.
> -  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
> +  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, LoopInfo
> *Li,
>                               LoopVectorizationLegality *Leg,
>                               const VectorTargetTransformInfo *Vtti):
> -  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
> +  TheLoop(Lp), SE(Se), LI(Li), Legal(Leg), VTTI(Vtti) { }
>  
> -  /// Returns the most profitable vectorization factor in powers of
> two.
> +  /// \return The most profitable vectorization factor.
>    /// This method checks every power of two up to VF. If UserVF is
>    not ZERO
>    /// then this vectorization factor will be selected if
>    vectorization is
>    /// possible.
>    unsigned selectVectorizationFactor(bool OptForSize, unsigned
>    UserVF);
>  
> +
> +  /// \return The most profitable unroll factor.
> +  /// If UserUF is non-zero then this method finds the best
> unroll-factor
> +  /// based on register pressure and other parameters.
> +  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
> +
> +  /// \brief A struct that represents some properties of the
> register usage
> +  /// of a loop.
> +  struct RegisterUsage {
> +    /// Holds the number of loop invariant values that are used in
> the loop.
> +    unsigned LoopInvariantRegs;
> +    /// Holds the maximum number of concurrent live intervals in the
> loop.
> +    unsigned MaxLocalUsers;
> +    /// Holds the number of instructions in the loop.
> +    unsigned NumInstructions;
> +  };
> +
> +  /// \return  information about the register usage of the loop.
> +  RegisterUsage calculateRegisterUsage();
> +
>  private:
>    /// Returns the expected execution cost. The unit of the cost does
>    /// not matter because we use the 'cost' units to compare
>    different
> @@ -504,7 +527,8 @@
>    Loop *TheLoop;
>    /// Scev analysis.
>    ScalarEvolution *SE;
> -
> +  /// Loop Info analysis.
> +  LoopInfo *LI;
>    /// Vectorization legality.
>    LoopVectorizationLegality *Legal;
>    /// Vector target information.
> 
> Modified:
> llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll
> (original)
> +++ llvm/trunk/test/Transforms/LoopVectorize/X86/gcc-examples.ll Fri
> Jan  4 11:48:25 2013
> @@ -1,4 +1,5 @@
>  ; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0
>  -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
> +; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0
> -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -licm -S |
> FileCheck %s -check-prefix=UNROLL
>  
>  target datalayout =
>  "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
>  target triple = "x86_64-apple-macosx10.8.0"
> @@ -13,6 +14,15 @@
>  ;CHECK: add nsw <4 x i32>
>  ;CHECK: store <4 x i32>
>  ;CHECK: ret void
> +
> +;UNROLL: @example1
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: add nsw <4 x i32>
> +;UNROLL: add nsw <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: ret void
>  define void @example1() nounwind uwtable ssp {
>    br label %1
>  
> @@ -34,13 +44,20 @@
>    ret void
>  }
>  
> -
> -; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
> +; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive.
>  ;CHECK: @example10b
>  ;CHECK: load <4 x i16>
>  ;CHECK: sext <4 x i16>
>  ;CHECK: store <4 x i32>
>  ;CHECK: ret void
> +;UNROLL: @example10b
> +;UNROLL: load <4 x i16>
> +;UNROLL: load <4 x i16>
> +;UNROLL: load <4 x i16>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: ret void
>  define void @example10b(i16* noalias nocapture %sa, i16* noalias
>  nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture
>  %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic)
>  nounwind uwtable ssp {
>    br label %1
>  
> 
> Modified: llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll
> URL:
> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll?rev=171469&r1=171468&r2=171469&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll
> (original)
> +++ llvm/trunk/test/Transforms/LoopVectorize/gcc-examples.ll Fri Jan
>  4 11:48:25 2013
> @@ -1,4 +1,5 @@
>  ; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce
>  -instcombine -licm -S | FileCheck %s
> +; RUN: opt < %s  -loop-vectorize -force-vector-width=4
> -force-vector-unroll=4 -dce -instcombine -licm -S | FileCheck %s
> -check-prefix=UNROLL
>  
>  target datalayout =
>  "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
>  target triple = "x86_64-apple-macosx10.8.0"
> @@ -24,6 +25,20 @@
>  ;CHECK: add nsw <4 x i32>
>  ;CHECK: store <4 x i32>
>  ;CHECK: ret void
> +;UNROLL: @example1
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: add nsw <4 x i32>
> +;UNROLL: add nsw <4 x i32>
> +;UNROLL: add nsw <4 x i32>
> +;UNROLL: add nsw <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: ret void
>  define void @example1() nounwind uwtable ssp {
>    br label %1
>  
> @@ -48,6 +63,12 @@
>  ;CHECK: @example2
>  ;CHECK: store <4 x i32>
>  ;CHECK: ret void
> +;UNROLL: @example2
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: ret void
>  define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
>    %1 = icmp sgt i32 %n, 0
>    br i1 %1, label %.lr.ph5, label %.preheader
> @@ -92,6 +113,12 @@
>  ;CHECK: @example3
>  ;CHECK: <4 x i32>
>  ;CHECK: ret void
> +;UNROLL: @example3
> +;UNROLL: <4 x i32>
> +;UNROLL: <4 x i32>
> +;UNROLL: <4 x i32>
> +;UNROLL: <4 x i32>
> +;UNROLL: ret void
>  define void @example3(i32 %n, i32* noalias nocapture %p, i32*
>  noalias nocapture %q) nounwind uwtable ssp {
>    %1 = icmp eq i32 %n, 0
>    br i1 %1, label %._crit_edge, label %.lr.ph
> @@ -115,6 +142,12 @@
>  ;CHECK: @example4
>  ;CHECK: load <4 x i32>
>  ;CHECK: ret void
> +;UNROLL: @example4
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: load <4 x i32>
> +;UNROLL: ret void
>  define void @example4(i32 %n, i32* noalias nocapture %p, i32*
>  noalias nocapture %q) nounwind uwtable ssp {
>    %1 = add nsw i32 %n, -1
>    %2 = icmp eq i32 %n, 0
> @@ -175,6 +208,12 @@
>  ;CHECK: @example8
>  ;CHECK: store <4 x i32>
>  ;CHECK: ret void
> +;UNROLL: @example8
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: store <4 x i32>
> +;UNROLL: ret void
>  define void @example8(i32 %x) nounwind uwtable ssp {
>    br label %.preheader
>  
> 
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> 

-- 
Hal Finkel
Postdoctoral Appointee
Leadership Computing Facility
Argonne National Laboratory