[llvm] r211339 - Add Support to Recognize and Vectorize NON SIMD instructions in SLPVectorizer.

Thu Jun 19 21:32:49 PDT 2014

Author: karthik
Date: Thu Jun 19 23:32:48 2014
New Revision: 211339

URL: http://llvm.org/viewvc/llvm-project?rev=211339&view=rev
Log:
Add Support to Recognize and Vectorize NON SIMD instructions in SLPVectorizer.

This patch adds support for recognizing alternating opcode patterns such as fadd,fsub,fadd,fsub,... or add,sub,add,sub,... and
vectorizing them as two vector operations blended by a vector shuffle, when doing so is profitable.
These shuffle patterns can later be converted to instructions such as addsubpd on X86.
Thanks to Arnold and Hal for the reviews. http://reviews.llvm.org/D4015 

Added:
    llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll
Modified:
    llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
    llvm/trunk/lib/CodeGen/BasicTargetTransformInfo.cpp
    llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
    llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=211339&r1=211338&r2=211339&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Thu Jun 19 23:32:48 2014
@@ -322,6 +322,7 @@ public:
   enum ShuffleKind {
     SK_Broadcast,       ///< Broadcast element 0 to all other elements.
     SK_Reverse,         ///< Reverse the order of the vector.
+    SK_Alternate,       ///< Choose alternate elements from vector.
     SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset.
     SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset.
   };

Modified: llvm/trunk/lib/CodeGen/BasicTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/BasicTargetTransformInfo.cpp?rev=211339&r1=211338&r2=211339&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/BasicTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/CodeGen/BasicTargetTransformInfo.cpp Thu Jun 19 23:32:48 2014
@@ -39,6 +39,9 @@ class BasicTTI final : public ImmutableP
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
+  /// Estimate the cost overhead of SK_Alternate shuffle.
+  unsigned getAltShuffleOverhead(Type *Ty) const;
+
   const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
 
 public:
@@ -327,8 +330,28 @@ unsigned BasicTTI::getArithmeticInstrCos
   return OpCost;
 }
 
+unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Can only shuffle vectors");
+  unsigned Cost = 0;
+  // Shuffle cost is equal to the cost of extracting element from its argument
+  // plus the cost of inserting them onto the result vector.
+
+  // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from index
+  // 0 of first vector, index 1 of second vector,index 2 of first vector and
+  // finally index 3 of second vector and insert them at index <0,1,2,3> of
+  // result vector.
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+  return Cost;
+}
+
 unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                   Type *SubTp) const {
+  if (Kind == SK_Alternate) {
+    return getAltShuffleOverhead(Tp);
+  }
   return 1;
 }
 

Modified: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp?rev=211339&r1=211338&r2=211339&view=diff
==============================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.cpp Thu Jun 19 23:32:48 2014
@@ -443,31 +443,58 @@ unsigned ARMTTI::getAddressComputationCo
 
 unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only handle costs of reverse shuffles for now.
-  if (Kind != SK_Reverse)
+  // We only handle costs of reverse and alternate shuffles for now.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
-    // Reverse shuffle cost one instruction if we are shuffling within a double
-    // word (vrev) or two if we shuffle a quad word (vrev, vext).
-    { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f32, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2i64, 1 },
-    { ISD::VECTOR_SHUFFLE, MVT::v2f64, 1 },
-
-    { ISD::VECTOR_SHUFFLE, MVT::v4i32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v4f32, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v8i16, 2 },
-    { ISD::VECTOR_SHUFFLE, MVT::v16i8, 2 }
-  };
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-
-  int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
-  if (Idx == -1)
-    return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
-
-  return LT.first * NEONShuffleTbl[Idx].Cost;
+  if (Kind == SK_Reverse) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
+        // Reverse shuffle cost one instruction if we are shuffling within a
+        // double word (vrev) or two if we shuffle a quad word (vrev, vext).
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+    return LT.first * NEONShuffleTbl[Idx].Cost;
+  }
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> NEONAltShuffleTbl[] = {
+        // Alt shuffle cost table for ARM. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    int Idx =
+        CostTableLookup(NEONAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * NEONAltShuffleTbl[Idx].Cost;
+  }
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,

Modified: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp?rev=211339&r1=211338&r2=211339&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp Thu Jun 19 23:32:48 2014
@@ -402,17 +402,47 @@ unsigned X86TTI::getArithmeticInstrCost(
 
 unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                                 Type *SubTp) const {
-  // We only estimate the cost of reverse shuffles.
-  if (Kind != SK_Reverse)
+  // We only estimate the cost of reverse and alternate shuffles.
+  if (Kind != SK_Reverse && Kind != SK_Alternate)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
-  unsigned Cost = 1;
-  if (LT.second.getSizeInBits() > 128)
-    Cost = 3; // Extract + insert + copy.
+  if (Kind == SK_Reverse) {
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+    unsigned Cost = 1;
+    if (LT.second.getSizeInBits() > 128)
+      Cost = 3; // Extract + insert + copy.
 
-  // Multiple by the number of parts.
-  return Cost * LT.first;
+    // Multiple by the number of parts.
+    return Cost * LT.first;
+  }
+
+  if (Kind == SK_Alternate) {
+    static const CostTblEntry<MVT::SimpleValueType> X86AltShuffleTbl[] = {
+        // Alt shuffle cost table for X86. Cost is the number of instructions
+        // required to create the shuffled vector.
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
+        {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v2i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
+        {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v4i16, 8},
+        {ISD::VECTOR_SHUFFLE, MVT::v8i16, 8},
+
+        {ISD::VECTOR_SHUFFLE, MVT::v16i8, 49}};
+
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+
+    int Idx = CostTableLookup(X86AltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
+    if (Idx == -1)
+      return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+    return LT.first * X86AltShuffleTbl[Idx].Cost;
+  }
+
+  return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
 unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {

Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=211339&r1=211338&r2=211339&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Jun 19 23:32:48 2014
@@ -149,6 +149,48 @@ static bool isSplat(ArrayRef<Value *> VL
   return true;
 }
 
+///\returns Opcode that can be clubbed with \p Op to create an alternate
+/// sequence which can later be merged as a ShuffleVector instruction.
+static unsigned getAltOpcode(unsigned Op) {
+  switch (Op) {
+  case Instruction::FAdd:
+    return Instruction::FSub;
+  case Instruction::FSub:
+    return Instruction::FAdd;
+  case Instruction::Add:
+    return Instruction::Sub;
+  case Instruction::Sub:
+    return Instruction::Add;
+  default:
+    return 0;
+  }
+}
+
+///\returns bool representing if Opcode \p Op can be part
+/// of an alternate sequence which can later be merged as
+/// a ShuffleVector instruction.
+static bool canCombineAsAltInst(unsigned Op) {
+  if (Op == Instruction::FAdd || Op == Instruction::FSub ||
+      Op == Instruction::Sub || Op == Instruction::Add)
+    return true;
+  return false;
+}
+
+/// \returns ShuffleVector instruction if intructions in \p VL have
+///  alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence.
+/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...)
+static unsigned isAltInst(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
+      return 0;
+  }
+  return Instruction::ShuffleVector;
+}
+
 /// \returns The opcode if all of the Instructions in \p VL have the same
 /// opcode, or zero.
 static unsigned getSameOpcode(ArrayRef<Value *> VL) {
@@ -158,8 +200,11 @@ static unsigned getSameOpcode(ArrayRef<V
   unsigned Opcode = I0->getOpcode();
   for (int i = 1, e = VL.size(); i < e; i++) {
     Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I || Opcode != I->getOpcode())
+    if (!I || Opcode != I->getOpcode()) {
+      if (canCombineAsAltInst(Opcode) && i == 1)
+        return isAltInst(VL);
       return 0;
+    }
   }
   return Opcode;
 }
@@ -377,6 +422,7 @@ public:
 
   /// \brief Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
+
 private:
   struct TreeEntry;
 
@@ -594,6 +640,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *
 
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
+  bool isAltShuffle = false;
   assert(SameTy && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
@@ -615,10 +662,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       newTreeEntry(VL, false);
       return;
     }
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check that this shuffle vector refers to the alternate
+  // sequence of opcodes.
+  if (Opcode == Instruction::ShuffleVector) {
+    Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+    unsigned Op = I0->getOpcode();
+    if (Op != Instruction::ShuffleVector)
+      isAltShuffle = true;
+  }
 
   // If all of the operands are identical or constant we have a simple solution.
-  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
-      !getSameOpcode(VL)) {
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
     newTreeEntry(VL, false);
     return;
@@ -754,8 +810,6 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
 
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
 
-  unsigned Opcode = getSameOpcode(VL);
-
   // Check if it is safe to sink the loads or the stores.
   if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
     Instruction *Last = getLastInstruction(VL);
@@ -1057,6 +1111,26 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
       }
       return;
     }
+    case Instruction::ShuffleVector: {
+      // If this is not an alternate sequence of opcode like add-sub
+      // then do not vectorize this instruction.
+      if (!isAltShuffle) {
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+        return;
+      }
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
     default:
       newTreeEntry(VL, false);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
@@ -1080,11 +1154,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
     }
     return getGatherCost(E->Scalars);
   }
-
-  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
-         "Invalid VL");
+  unsigned Opcode = getSameOpcode(VL);
+  assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
   Instruction *VL0 = cast<Instruction>(VL[0]);
-  unsigned Opcode = VL0->getOpcode();
   switch (Opcode) {
     case Instruction::PHI: {
       return 0;
@@ -1242,6 +1314,32 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
 
       return VecCallCost - ScalarCallCost;
     }
+    case Instruction::ShuffleVector: {
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_AnyValue;
+      int ScalarCost = 0;
+      int VecCost = 0;
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        Instruction *I = cast<Instruction>(VL[i]);
+        if (!I)
+          break;
+        ScalarCost +=
+            TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
+      }
+      // VecCost is equal to sum of the cost of creating 2 vectors
+      // and the cost of creating shuffle.
+      Instruction *I0 = cast<Instruction>(VL[0]);
+      VecCost =
+          TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
+      Instruction *I1 = cast<Instruction>(VL[1]);
+      VecCost +=
+          TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
+      VecCost +=
+          TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
+      return VecCost - ScalarCost;
+    }
     default:
       llvm_unreachable("Unknown instruction");
   }
@@ -1522,9 +1620,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
     setInsertPointAfterBundle(E->Scalars);
     return Gather(E->Scalars, VecTy);
   }
-
-  unsigned Opcode = VL0->getOpcode();
-  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+  unsigned Opcode = getSameOpcode(E->Scalars);
 
   switch (Opcode) {
     case Instruction::PHI: {
@@ -1797,6 +1893,49 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
       E->VectorizedValue = V;
       return V;
     }
+    case Instruction::ShuffleVector: {
+      ValueList LHSVL, RHSVL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      }
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *LHS = vectorizeTree(LHSVL);
+      Value *RHS = vectorizeTree(RHSVL);
+
+      if (Value *V = alreadyVectorized(E->Scalars))
+        return V;
+
+      // Create a vector of LHS op1 RHS
+      BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
+      Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
+
+      // Create a vector of LHS op2 RHS
+      Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
+      BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
+      Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
+
+      // Create appropriate shuffle to take alternative operations from
+      // the vector.
+      std::vector<Constant *> Mask(E->Scalars.size());
+      unsigned e = E->Scalars.size();
+      for (unsigned i = 0; i < e; ++i) {
+        if (i & 1)
+          Mask[i] = Builder.getInt32(e + i);
+        else
+          Mask[i] = Builder.getInt32(i);
+      }
+
+      Value *ShuffleMask = ConstantVector::get(Mask);
+
+      Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+      E->VectorizedValue = V;
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
+      return V;
+    }
     default:
     llvm_unreachable("unknown inst");
   }
@@ -1865,7 +2004,6 @@ Value *BoUpSLP::vectorizeTree() {
     // For each lane:
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
-
       // No need to handle users of gathered values.
       if (Entry->NeedToGather)
         continue;
@@ -2049,7 +2187,6 @@ struct SLPVectorizer : public FunctionPa
     for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
          e = po_end(&F.getEntryBlock()); it != e; ++it) {
       BasicBlock *BB = *it;
-
       // Vectorize trees that end at stores.
       if (unsigned count = collectStores(BB, R)) {
         (void)count;

Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll?rev=211339&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll (added)
+++ llvm/trunk/test/Transforms/SLPVectorizer/X86/addsub.ll Thu Jun 19 23:32:48 2014
@@ -0,0 +1,181 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+ at b = common global [4 x i32] zeroinitializer, align 16
+ at c = common global [4 x i32] zeroinitializer, align 16
+ at d = common global [4 x i32] zeroinitializer, align 16
+ at e = common global [4 x i32] zeroinitializer, align 16
+ at a = common global [4 x i32] zeroinitializer, align 16
+ at fb = common global [4 x float] zeroinitializer, align 16
+ at fc = common global [4 x float] zeroinitializer, align 16
+ at fa = common global [4 x float] zeroinitializer, align 16
+
+; CHECK-LABEL: @addsub
+; CHECK: %5 = add <4 x i32> %3, %4
+; CHECK: %6 = add <4 x i32> %2, %5
+; CHECK: %7 = sub <4 x i32> %2, %5
+; CHECK: %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind uwtable
+define void @addsub() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
+  %add = add nsw i32 %0, %1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
+  %add1 = add nsw i32 %2, %3
+  %add2 = add nsw i32 %add, %add1
+  store i32 %add2, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
+  %add3 = add nsw i32 %4, %5
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
+  %add4 = add nsw i32 %6, %7
+  %sub = sub nsw i32 %add3, %add4
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
+  %add5 = add nsw i32 %8, %9
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
+  %add6 = add nsw i32 %10, %11
+  %add7 = add nsw i32 %add5, %add6
+  store i32 %add7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
+  %add8 = add nsw i32 %12, %13
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
+  %add9 = add nsw i32 %14, %15
+  %sub10 = sub nsw i32 %add8, %add9
+  store i32 %sub10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
+  ret void
+}
+
+; CHECK-LABEL: @subadd
+; CHECK:  %5 = add <4 x i32> %3, %4
+; CHECK:  %6 = sub <4 x i32> %2, %5
+; CHECK:  %7 = add <4 x i32> %2, %5
+; CHECK:  %8 = shufflevector <4 x i32> %6, <4 x i32> %7, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+
+; Function Attrs: nounwind uwtable
+define void @subadd() #0 {
+entry:
+  %0 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 0), align 4
+  %1 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 0), align 4
+  %add = add nsw i32 %0, %1
+  %2 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 0), align 4
+  %3 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 0), align 4
+  %add1 = add nsw i32 %2, %3
+  %sub = sub nsw i32 %add, %add1
+  store i32 %sub, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 0), align 4
+  %4 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 1), align 4
+  %5 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 1), align 4
+  %add2 = add nsw i32 %4, %5
+  %6 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 1), align 4
+  %7 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 1), align 4
+  %add3 = add nsw i32 %6, %7
+  %add4 = add nsw i32 %add2, %add3
+  store i32 %add4, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 1), align 4
+  %8 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 2), align 4
+  %9 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 2), align 4
+  %add5 = add nsw i32 %8, %9
+  %10 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 2), align 4
+  %11 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 2), align 4
+  %add6 = add nsw i32 %10, %11
+  %sub7 = sub nsw i32 %add5, %add6
+  store i32 %sub7, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 2), align 4
+  %12 = load i32* getelementptr inbounds ([4 x i32]* @b, i32 0, i64 3), align 4
+  %13 = load i32* getelementptr inbounds ([4 x i32]* @c, i32 0, i64 3), align 4
+  %add8 = add nsw i32 %12, %13
+  %14 = load i32* getelementptr inbounds ([4 x i32]* @d, i32 0, i64 3), align 4
+  %15 = load i32* getelementptr inbounds ([4 x i32]* @e, i32 0, i64 3), align 4
+  %add9 = add nsw i32 %14, %15
+  %add10 = add nsw i32 %add8, %add9
+  store i32 %add10, i32* getelementptr inbounds ([4 x i32]* @a, i32 0, i64 3), align 4
+  ret void
+}
+
+; CHECK-LABEL: @faddfsub
+; CHECK: %2 = fadd <4 x float> %0, %1
+; CHECK: %3 = fsub <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind uwtable
+define void @faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+  %sub = fsub float %2, %3
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+  %add1 = fadd float %4, %5
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+  %sub2 = fsub float %6, %7
+  store float %sub2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+  ret void
+}
+
+; CHECK-LABEL: @fsubfadd
+; CHECK: %2 = fsub <4 x float> %0, %1
+; CHECK: %3 = fadd <4 x float> %0, %1
+; CHECK: %4 = shufflevector <4 x float> %2, <4 x float> %3, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; Function Attrs: nounwind uwtable
+define void @fsubfadd() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+  %sub = fsub float %0, %1
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+  %add = fadd float %2, %3
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+  %sub1 = fsub float %4, %5
+  store float %sub1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+  %add2 = fadd float %6, %7
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+  ret void
+}
+
+; CHECK-LABEL: @No_faddfsub
+; CHECK-NOT: fadd <4 x float>
+; CHECK-NOT: fsub <4 x float>
+; CHECK-NOT: shufflevector
+; Function Attrs: nounwind uwtable
+define void @No_faddfsub() #0 {
+entry:
+  %0 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 0), align 4
+  %1 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 0), align 4
+  %add = fadd float %0, %1
+  store float %add, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 0), align 4
+  %2 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 1), align 4
+  %3 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 1), align 4
+  %add1 = fadd float %2, %3
+  store float %add1, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 1), align 4
+  %4 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 2), align 4
+  %5 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 2), align 4
+  %add2 = fadd float %4, %5
+  store float %add2, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 2), align 4
+  %6 = load float* getelementptr inbounds ([4 x float]* @fb, i32 0, i64 3), align 4
+  %7 = load float* getelementptr inbounds ([4 x float]* @fc, i32 0, i64 3), align 4
+  %sub = fsub float %6, %7
+  store float %sub, float* getelementptr inbounds ([4 x float]* @fa, i32 0, i64 3), align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+