[llvm] r323447 - Revert "[SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle."

Philip Reames via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 29 09:40:38 PST 2018


Alexey,

It is customary when reverting a patch to reply to the commit thread, 
stating that the patch has been reverted and in which commit.  It is also 
customary to include a brief summary of the bugs fixed when 
resubmitting.  This makes it easier for the broader community to follow 
along and see the progress being made.

Philip


On 01/25/2018 09:28 AM, Alexey Bataev via llvm-commits wrote:
> Author: abataev
> Date: Thu Jan 25 09:28:12 2018
> New Revision: 323447
>
> URL: http://llvm.org/viewvc/llvm-project?rev=323447&view=rev
> Log:
> Revert "[SLP] Fix for PR32086: Count InsertElementInstr of the same elements as shuffle."
>
> This reverts commit r323441 to fix buildbots.
>
> Modified:
>      llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>      llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll
>      llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
>      llvm/trunk/test/Transforms/SLPVectorizer/X86/hoist.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=323447&r1=323446&r2=323447&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Thu Jan 25 09:28:12 2018
> @@ -662,9 +662,13 @@ private:
>     /// Vectorize a single entry in the tree, starting in \p VL.
>     Value *vectorizeTree(ArrayRef<Value *> VL);
>   
> +  /// \returns the pointer to the vectorized value if \p VL is already
> +  /// vectorized, or NULL. They may happen in cycles.
> +  Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
> +
>     /// \returns the scalarization cost for this type. Scalarization in this
>     /// context means the creation of vectors from a group of scalars.
> -  int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices);
> +  int getGatherCost(Type *Ty);
>   
>     /// \returns the scalarization cost for this list of values. Assuming that
>     /// this subtree gets vectorized, we may need to extract the values from the
> @@ -698,12 +702,8 @@ private:
>   
>       /// \returns true if the scalars in VL are equal to this entry.
>       bool isSame(ArrayRef<Value *> VL) const {
> -      if (VL.size() == Scalars.size())
> -        return std::equal(VL.begin(), VL.end(), Scalars.begin());
> -      return VL.size() == ReuseShuffleIndices.size() &&
> -             std::equal(
> -                 VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
> -                 [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
> +      assert(VL.size() == Scalars.size() && "Invalid size");
> +      return std::equal(VL.begin(), VL.end(), Scalars.begin());
>       }
>   
>       /// A vector of scalars.
> @@ -715,9 +715,6 @@ private:
>       /// Do we need to gather this sequence ?
>       bool NeedToGather = false;
>   
> -    /// Does this sequence require some shuffling?
> -    SmallVector<unsigned, 4> ReuseShuffleIndices;
> -
>       /// Points back to the VectorizableTree.
>       ///
>       /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
> @@ -732,15 +729,13 @@ private:
>     };
>   
>     /// Create a new VectorizableTree entry.
> -  void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
> -                    ArrayRef<unsigned> ReuseShuffleIndices = None) {
> +  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
> +                          int &UserTreeIdx) {
>       VectorizableTree.emplace_back(VectorizableTree);
>       int idx = VectorizableTree.size() - 1;
>       TreeEntry *Last = &VectorizableTree[idx];
>       Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
>       Last->NeedToGather = !Vectorized;
> -    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
> -                                     ReuseShuffleIndices.end());
>       if (Vectorized) {
>         for (int i = 0, e = VL.size(); i != e; ++i) {
>           assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
> @@ -753,6 +748,7 @@ private:
>       if (UserTreeIdx >= 0)
>         Last->UserTreeIndices.push_back(UserTreeIdx);
>       UserTreeIdx = idx;
> +    return Last;
>     }
>   
>     /// -- Vectorization State --
> @@ -766,6 +762,13 @@ private:
>       return nullptr;
>     }
>   
> +  const TreeEntry *getTreeEntry(Value *V) const {
> +    auto I = ScalarToTreeEntry.find(V);
> +    if (I != ScalarToTreeEntry.end())
> +      return &VectorizableTree[I->second];
> +    return nullptr;
> +  }
> +
>     /// Maps a specific scalar to its tree entry.
>     SmallDenseMap<Value*, int> ScalarToTreeEntry;
>   
> @@ -1429,11 +1432,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>   
>     // Check if this is a duplicate of another entry.
>     if (TreeEntry *E = getTreeEntry(S.OpValue)) {
> -    DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
> -    if (!E->isSame(VL)) {
> -      DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
> -      newTreeEntry(VL, false, UserTreeIdx);
> -      return;
> +    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
> +      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
> +      if (E->Scalars[i] != VL[i]) {
> +        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
> +        newTreeEntry(VL, false, UserTreeIdx);
> +        return;
> +      }
>       }
>       // Record the reuse of the tree node.  FIXME, currently this is only used to
>       // properly draw the graph rather than for the actual vectorization.
> @@ -1479,26 +1484,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>     }
>   
>     // Check that every instruction appears once in this bundle.
> -  SmallVector<unsigned, 4> ReuseShuffleIndicies;
> -  SmallVector<Value *, 4> UniqueValues;
> -  DenseMap<Value *, unsigned> UniquePositions;
> -  for (Value *V : VL) {
> -    auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
> -    ReuseShuffleIndicies.emplace_back(Res.first->second);
> -    if (Res.second)
> -      UniqueValues.emplace_back(V);
> -  }
> -  if (UniqueValues.size() == VL.size()) {
> -    ReuseShuffleIndicies.clear();
> -  } else {
> -    DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
> -    if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
> -      DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
> -      newTreeEntry(VL, false, UserTreeIdx);
> -      return;
> -    }
> -    VL = UniqueValues;
> -  }
> +  for (unsigned i = 0, e = VL.size(); i < e; ++i)
> +    for (unsigned j = i + 1; j < e; ++j)
> +      if (VL[i] == VL[j]) {
> +        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
> +        newTreeEntry(VL, false, UserTreeIdx);
> +        return;
> +      }
>   
>     auto &BSRef = BlocksSchedules[BB];
>     if (!BSRef)
> @@ -1506,12 +1498,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>   
>     BlockScheduling &BS = *BSRef.get();
>   
> -  if (!BS.tryScheduleBundle(VL, this, VL0)) {
> +  if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
>       DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
>       assert((!BS.getScheduleData(VL0) ||
>               !BS.getScheduleData(VL0)->isPartOfBundle()) &&
>              "tryScheduleBundle should cancelScheduling on failure");
> -    newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +    newTreeEntry(VL, false, UserTreeIdx);
>       return;
>     }
>     DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
> @@ -1530,12 +1522,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             if (Term) {
>               DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
>               BS.cancelScheduling(VL, VL0);
> -            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +            newTreeEntry(VL, false, UserTreeIdx);
>               return;
>             }
>           }
>   
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
>   
>         for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
> @@ -1553,7 +1545,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>       case Instruction::ExtractElement: {
>         bool Reuse = canReuseExtract(VL, VL0);
>         if (Reuse) {
> -        DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
> +        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
>           ++NumOpsWantToKeepOrder[S.Opcode];
>         } else {
>           SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
> @@ -1561,7 +1553,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             --NumOpsWantToKeepOrder[S.Opcode];
>           BS.cancelScheduling(VL, VL0);
>         }
> -      newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, Reuse, UserTreeIdx);
>         return;
>       }
>       case Instruction::Load: {
> @@ -1576,7 +1568,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         if (DL->getTypeSizeInBits(ScalarTy) !=
>             DL->getTypeAllocSizeInBits(ScalarTy)) {
>           BS.cancelScheduling(VL, VL0);
> -        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +        newTreeEntry(VL, false, UserTreeIdx);
>           DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
>           return;
>         }
> @@ -1587,7 +1579,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           LoadInst *L = cast<LoadInst>(VL[i]);
>           if (!L->isSimple()) {
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
>             return;
>           }
> @@ -1609,7 +1601,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>   
>         if (Consecutive) {
>           ++NumOpsWantToKeepOrder[S.Opcode];
> -        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +        newTreeEntry(VL, true, UserTreeIdx);
>           DEBUG(dbgs() << "SLP: added a vector of loads.\n");
>           return;
>         }
> @@ -1624,7 +1616,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             }
>   
>         BS.cancelScheduling(VL, VL0);
> -      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, false, UserTreeIdx);
>   
>         if (ReverseConsecutive) {
>           --NumOpsWantToKeepOrder[S.Opcode];
> @@ -1651,12 +1643,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
>           if (Ty != SrcTy || !isValidElementType(Ty)) {
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
>             return;
>           }
>         }
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a vector of casts.\n");
>   
>         for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> @@ -1679,13 +1671,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           if (Cmp->getPredicate() != P0 ||
>               Cmp->getOperand(0)->getType() != ComparedTy) {
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
>             return;
>           }
>         }
>   
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a vector of compares.\n");
>   
>         for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
> @@ -1717,7 +1709,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>       case Instruction::And:
>       case Instruction::Or:
>       case Instruction::Xor:
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
>   
>         // Sort operands of the instructions so that each side is more likely to
> @@ -1746,7 +1738,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
>             DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
>         }
> @@ -1759,7 +1751,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>           if (Ty0 != CurTy) {
>             DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
>         }
> @@ -1771,12 +1763,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             DEBUG(
>                 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             return;
>           }
>         }
>   
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
>         for (unsigned i = 0, e = 2; i < e; ++i) {
>           ValueList Operands;
> @@ -1793,12 +1785,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
>           if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
>             return;
>           }
>   
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a vector of stores.\n");
>   
>         ValueList Operands;
> @@ -1816,7 +1808,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
>         if (!isTriviallyVectorizable(ID)) {
>           BS.cancelScheduling(VL, VL0);
> -        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +        newTreeEntry(VL, false, UserTreeIdx);
>           DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
>           return;
>         }
> @@ -1830,7 +1822,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>               getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
>               !CI->hasIdenticalOperandBundleSchema(*CI2)) {
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
>                          << "\n");
>             return;
> @@ -1841,7 +1833,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>             Value *A1J = CI2->getArgOperand(1);
>             if (A1I != A1J) {
>               BS.cancelScheduling(VL, VL0);
> -            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +            newTreeEntry(VL, false, UserTreeIdx);
>               DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
>                            << " argument "<< A1I<<"!=" << A1J
>                            << "\n");
> @@ -1854,14 +1846,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>                           CI->op_begin() + CI->getBundleOperandsEndIndex(),
>                           CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
>             BS.cancelScheduling(VL, VL0);
> -          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +          newTreeEntry(VL, false, UserTreeIdx);
>             DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
>                          << *VL[i] << '\n');
>             return;
>           }
>         }
>   
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
>           ValueList Operands;
>           // Prepare the operand vector.
> @@ -1878,11 +1870,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>         // then do not vectorize this instruction.
>         if (!S.IsAltShuffle) {
>           BS.cancelScheduling(VL, VL0);
> -        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +        newTreeEntry(VL, false, UserTreeIdx);
>           DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
>           return;
>         }
> -      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, true, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
>   
>         // Reorder operands if reordering would enable vectorization.
> @@ -1906,7 +1898,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>   
>       default:
>         BS.cancelScheduling(VL, VL0);
> -      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
> +      newTreeEntry(VL, false, UserTreeIdx);
>         DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
>         return;
>     }
> @@ -1999,22 +1991,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>       VecTy = VectorType::get(
>           IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
>   
> -  unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
> -  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
> -  int ReuseShuffleCost = 0;
> -  if (NeedToShuffleReuses) {
> -    ReuseShuffleCost =
> -        TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
> -  }
>     if (E->NeedToGather) {
>       if (allConstant(VL))
>         return 0;
>       if (isSplat(VL)) {
> -      return ReuseShuffleCost +
> -             TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
> +      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
>       }
> -    if (getSameOpcode(VL).Opcode == Instruction::ExtractElement &&
> -        allSameType(VL) && allSameBlock(VL)) {
> +    if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
>         Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
>         if (ShuffleKind.hasValue()) {
>           int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
> @@ -2031,10 +2014,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>                                               IO->getZExtValue());
>             }
>           }
> -        return ReuseShuffleCost + Cost;
> +        return Cost;
>         }
>       }
> -    return ReuseShuffleCost + getGatherCost(VL);
> +    return getGatherCost(E->Scalars);
>     }
>     InstructionsState S = getSameOpcode(VL);
>     assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
> @@ -2047,36 +2030,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>   
>       case Instruction::ExtractValue:
>       case Instruction::ExtractElement:
> -      if (NeedToShuffleReuses) {
> -        unsigned Idx = 0;
> -        for (unsigned I : E->ReuseShuffleIndices) {
> -          if (ShuffleOrOp == Instruction::ExtractElement) {
> -            auto *IO = cast<ConstantInt>(
> -                cast<ExtractElementInst>(VL[I])->getIndexOperand());
> -            Idx = IO->getZExtValue();
> -            ReuseShuffleCost -= TTI->getVectorInstrCost(
> -                Instruction::ExtractElement, VecTy, Idx);
> -          } else {
> -            ReuseShuffleCost -= TTI->getVectorInstrCost(
> -                Instruction::ExtractElement, VecTy, Idx);
> -            ++Idx;
> -          }
> -        }
> -        Idx = ReuseShuffleNumbers;
> -        for (Value *V : VL) {
> -          if (ShuffleOrOp == Instruction::ExtractElement) {
> -            auto *IO = cast<ConstantInt>(
> -                cast<ExtractElementInst>(V)->getIndexOperand());
> -            Idx = IO->getZExtValue();
> -          } else {
> -            --Idx;
> -          }
> -          ReuseShuffleCost +=
> -              TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
> -        }
> -      }
>         if (canReuseExtract(VL, S.OpValue)) {
> -        int DeadCost = ReuseShuffleCost;
> +        int DeadCost = 0;
>           for (unsigned i = 0, e = VL.size(); i < e; ++i) {
>             Instruction *E = cast<Instruction>(VL[i]);
>             // If all users are going to be vectorized, instruction can be
> @@ -2084,12 +2039,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>             // The same, if have only one user, it will be vectorized for sure.
>             if (areAllUsersVectorized(E))
>               // Take credit for instruction that will become dead.
> -            DeadCost -=
> +            DeadCost +=
>                   TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
>           }
> -        return DeadCost;
> +        return -DeadCost;
>         }
> -      return ReuseShuffleCost + getGatherCost(VL);
> +      return getGatherCost(VecTy);
>   
>       case Instruction::ZExt:
>       case Instruction::SExt:
> @@ -2104,11 +2059,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>       case Instruction::FPTrunc:
>       case Instruction::BitCast: {
>         Type *SrcTy = VL0->getOperand(0)->getType();
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -=
> -            (ReuseShuffleNumbers - VL.size()) *
> -            TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0);
> -      }
>   
>         // Calculate the cost of this instruction.
>         int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
> @@ -2117,26 +2067,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>         VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
>         int VecCost = 0;
>         // Check if the values are candidates to demote.
> -      if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
> -        VecCost = ReuseShuffleCost +
> -                  TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
> -      }
> +      if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
> +        VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
>         return VecCost - ScalarCost;
>       }
>       case Instruction::FCmp:
>       case Instruction::ICmp:
>       case Instruction::Select: {
>         // Calculate the cost of this instruction.
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
> -                            TTI->getCmpSelInstrCost(S.Opcode, ScalarTy,
> -                                                    Builder.getInt1Ty(), VL0);
> -      }
>         VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
>         int ScalarCost = VecTy->getNumElements() *
>             TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
>         int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
> -      return ReuseShuffleCost + VecCost - ScalarCost;
> +      return VecCost - ScalarCost;
>       }
>       case Instruction::Add:
>       case Instruction::FAdd:
> @@ -2194,19 +2137,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>           Op2VP = TargetTransformInfo::OP_PowerOf2;
>   
>         SmallVector<const Value *, 4> Operands(VL0->operand_values());
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -=
> -            (ReuseShuffleNumbers - VL.size()) *
> -            TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
> -                                        Op2VP, Operands);
> -      }
>         int ScalarCost =
>             VecTy->getNumElements() *
>             TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
>                                         Op2VP, Operands);
>         int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
>                                                   Op1VP, Op2VP, Operands);
> -      return ReuseShuffleCost + VecCost - ScalarCost;
> +      return VecCost - ScalarCost;
>       }
>       case Instruction::GetElementPtr: {
>         TargetTransformInfo::OperandValueKind Op1VK =
> @@ -2214,46 +2151,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>         TargetTransformInfo::OperandValueKind Op2VK =
>             TargetTransformInfo::OK_UniformConstantValue;
>   
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
> -                            TTI->getArithmeticInstrCost(Instruction::Add,
> -                                                        ScalarTy, Op1VK, Op2VK);
> -      }
>         int ScalarCost =
>             VecTy->getNumElements() *
>             TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
>         int VecCost =
>             TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
>   
> -      return ReuseShuffleCost + VecCost - ScalarCost;
> +      return VecCost - ScalarCost;
>       }
>       case Instruction::Load: {
>         // Cost of wide load - cost of scalar loads.
>         unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
> -                            TTI->getMemoryOpCost(Instruction::Load, ScalarTy,
> -                                                 alignment, 0, VL0);
> -      }
>         int ScalarLdCost = VecTy->getNumElements() *
>             TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
>         int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
>                                              VecTy, alignment, 0, VL0);
> -      return ReuseShuffleCost + VecLdCost - ScalarLdCost;
> +      return VecLdCost - ScalarLdCost;
>       }
>       case Instruction::Store: {
>         // We know that we can merge the stores. Calculate the cost.
>         unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) *
> -                            TTI->getMemoryOpCost(Instruction::Store, ScalarTy,
> -                                                 alignment, 0, VL0);
> -      }
>         int ScalarStCost = VecTy->getNumElements() *
>             TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
>         int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
>                                              VecTy, alignment, 0, VL0);
> -      return ReuseShuffleCost + VecStCost - ScalarStCost;
> +      return VecStCost - ScalarStCost;
>       }
>       case Instruction::Call: {
>         CallInst *CI = cast<CallInst>(VL0);
> @@ -2268,11 +2190,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>         if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
>           FMF = FPMO->getFastMathFlags();
>   
> -      if (NeedToShuffleReuses) {
> -        ReuseShuffleCost -=
> -            (ReuseShuffleNumbers - VL.size()) *
> -            TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
> -      }
>         int ScalarCallCost = VecTy->getNumElements() *
>             TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
>   
> @@ -2284,7 +2201,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>               << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
>               << " for " << *CI << "\n");
>   
> -      return ReuseShuffleCost + VecCallCost - ScalarCallCost;
> +      return VecCallCost - ScalarCallCost;
>       }
>       case Instruction::ShuffleVector: {
>         TargetTransformInfo::OperandValueKind Op1VK =
> @@ -2292,22 +2209,6 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>         TargetTransformInfo::OperandValueKind Op2VK =
>             TargetTransformInfo::OK_AnyValue;
>         int ScalarCost = 0;
> -      if (NeedToShuffleReuses) {
> -        for (unsigned Idx : E->ReuseShuffleIndices) {
> -          Instruction *I = cast<Instruction>(VL[Idx]);
> -          if (!I)
> -            continue;
> -          ReuseShuffleCost -= TTI->getArithmeticInstrCost(
> -              I->getOpcode(), ScalarTy, Op1VK, Op2VK);
> -        }
> -        for (Value *V : VL) {
> -          Instruction *I = cast<Instruction>(V);
> -          if (!I)
> -            continue;
> -          ReuseShuffleCost += TTI->getArithmeticInstrCost(
> -              I->getOpcode(), ScalarTy, Op1VK, Op2VK);
> -        }
> -      }
>         int VecCost = 0;
>         for (Value *i : VL) {
>           Instruction *I = cast<Instruction>(i);
> @@ -2326,7 +2227,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>             TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
>         VecCost +=
>             TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
> -      return ReuseShuffleCost + VecCost - ScalarCost;
> +      return VecCost - ScalarCost;
>       }
>       default:
>         llvm_unreachable("Unknown instruction");
> @@ -2502,14 +2403,10 @@ int BoUpSLP::getTreeCost() {
>     return Cost;
>   }
>   
> -int BoUpSLP::getGatherCost(Type *Ty,
> -                           const DenseSet<unsigned> &ShuffledIndices) {
> +int BoUpSLP::getGatherCost(Type *Ty) {
>     int Cost = 0;
>     for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
> -    if (!ShuffledIndices.count(i))
> -      Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
> -  if (!ShuffledIndices.empty())
> -      Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
> +    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
>     return Cost;
>   }
>   
> @@ -2520,17 +2417,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Valu
>       ScalarTy = SI->getValueOperand()->getType();
>     VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
>     // Find the cost of inserting/extracting values from the vector.
> -  // Check if the same elements are inserted several times and count them as
> -  // shuffle candidates.
> -  DenseSet<unsigned> ShuffledElements;
> -  DenseSet<Value *> UniqueElements;
> -  // Iterate in reverse order to consider insert elements with the high cost.
> -  for (unsigned I = VL.size(); I > 0; --I) {
> -    unsigned Idx = I - 1;
> -    if (!UniqueElements.insert(VL[Idx]).second)
> -      ShuffledElements.insert(Idx);
> -  }
> -  return getGatherCost(VecTy, ShuffledElements);
> +  return getGatherCost(VecTy);
>   }
>   
>   // Reorder commutative operations in alternate shuffle if the resulting vectors
> @@ -2828,7 +2715,7 @@ Value *BoUpSLP::Gather(ArrayRef<Value *>
>         if (TreeEntry *E = getTreeEntry(VL[i])) {
>           // Find which lane we need to extract.
>           int FoundLane = -1;
> -        for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
> +        for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
>             // Is this the lane of the scalar that we are looking for ?
>             if (E->Scalars[Lane] == VL[i]) {
>               FoundLane = Lane;
> @@ -2844,6 +2731,14 @@ Value *BoUpSLP::Gather(ArrayRef<Value *>
>     return Vec;
>   }
>   
> +Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
> +  if (const TreeEntry *En = getTreeEntry(OpValue)) {
> +    if (En->isSame(VL) && En->VectorizedValue)
> +      return En->VectorizedValue;
> +  }
> +  return nullptr;
> +}
> +
>   Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
>     InstructionsState S = getSameOpcode(VL);
>     if (S.Opcode) {
> @@ -2856,38 +2751,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>     Type *ScalarTy = S.OpValue->getType();
>     if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
>       ScalarTy = SI->getValueOperand()->getType();
> -
> -  // Check that every instruction appears once in this bundle.
> -  SmallVector<unsigned, 4> ReuseShuffleIndicies;
> -  SmallVector<Value *, 4> UniqueValues;
> -  if (VL.size() > 2) {
> -    DenseMap<Value *, unsigned> UniquePositions;
> -    for (Value *V : VL) {
> -      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
> -      ReuseShuffleIndicies.emplace_back(Res.first->second);
> -      if (Res.second || isa<Constant>(V))
> -        UniqueValues.emplace_back(V);
> -    }
> -    // Do not shuffle single element or if number of unique values is not power
> -    // of 2.
> -    if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
> -        !llvm::isPowerOf2_32(UniqueValues.size()))
> -      ReuseShuffleIndicies.clear();
> -    else
> -      VL = UniqueValues;
> -  }
>     VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
>   
> -  Value *V = Gather(VL, VecTy);
> -  if (!ReuseShuffleIndicies.empty()) {
> -    V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                    ReuseShuffleIndicies, "shuffle");
> -    if (auto *I = dyn_cast<Instruction>(V)) {
> -      GatherSeq.insert(I);
> -      CSEBlocks.insert(I->getParent());
> -    }
> -  }
> -  return V;
> +  return Gather(VL, VecTy);
>   }
>   
>   Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
> @@ -2905,19 +2771,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>       ScalarTy = SI->getValueOperand()->getType();
>     VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
>   
> -  bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
> -
>     if (E->NeedToGather) {
>       setInsertPointAfterBundle(E->Scalars, VL0);
>       auto *V = Gather(E->Scalars, VecTy);
> -    if (NeedToShuffleReuses) {
> -      V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                      E->ReuseShuffleIndices, "shuffle");
> -      if (auto *I = dyn_cast<Instruction>(V)) {
> -        GatherSeq.insert(I);
> -        CSEBlocks.insert(I->getParent());
> -      }
> -    }
>       E->VectorizedValue = V;
>       return V;
>     }
> @@ -2930,12 +2786,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
>         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
>         PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
> -      Value *V = NewPhi;
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
> -      E->VectorizedValue = V;
> +      E->VectorizedValue = NewPhi;
>   
>         // PHINodes may have multiple entries from the same block. We want to
>         // visit every block once.
> @@ -2962,30 +2813,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>   
>         assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
>                "Invalid number of incoming values");
> -      return V;
> +      return NewPhi;
>       }
>   
>       case Instruction::ExtractElement: {
>         if (canReuseExtract(E->Scalars, VL0)) {
>           Value *V = VL0->getOperand(0);
> -        if (NeedToShuffleReuses) {
> -          Builder.SetInsertPoint(VL0);
> -          V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                          E->ReuseShuffleIndices, "shuffle");
> -        }
>           E->VectorizedValue = V;
>           return V;
>         }
>         setInsertPointAfterBundle(E->Scalars, VL0);
>         auto *V = Gather(E->Scalars, VecTy);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -        if (auto *I = dyn_cast<Instruction>(V)) {
> -          GatherSeq.insert(I);
> -          CSEBlocks.insert(I->getParent());
> -        }
> -      }
>         E->VectorizedValue = V;
>         return V;
>       }
> @@ -2996,24 +2834,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>           PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
>           Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
>           LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
> -        Value *NewV = propagateMetadata(V, E->Scalars);
> -        if (NeedToShuffleReuses) {
> -          NewV = Builder.CreateShuffleVector(
> -              NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
> -        }
> -        E->VectorizedValue = NewV;
> -        return NewV;
> +        E->VectorizedValue = V;
> +        return propagateMetadata(V, E->Scalars);
>         }
>         setInsertPointAfterBundle(E->Scalars, VL0);
>         auto *V = Gather(E->Scalars, VecTy);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -        if (auto *I = dyn_cast<Instruction>(V)) {
> -          GatherSeq.insert(I);
> -          CSEBlocks.insert(I->getParent());
> -        }
> -      }
>         E->VectorizedValue = V;
>         return V;
>       }
> @@ -3037,17 +2862,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>   
>         Value *InVec = vectorizeTree(INVL);
>   
> -      if (E->VectorizedValue) {
> -        DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
> -        return E->VectorizedValue;
> -      }
> +      if (Value *V = alreadyVectorized(E->Scalars, VL0))
> +        return V;
>   
>         CastInst *CI = dyn_cast<CastInst>(VL0);
>         Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
>         ++NumVectorInstructions;
>         return V;
> @@ -3065,10 +2884,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         Value *L = vectorizeTree(LHSV);
>         Value *R = vectorizeTree(RHSV);
>   
> -      if (E->VectorizedValue) {
> -        DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
> -        return E->VectorizedValue;
> -      }
> +      if (Value *V = alreadyVectorized(E->Scalars, VL0))
> +        return V;
>   
>         CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
>         Value *V;
> @@ -3077,12 +2894,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         else
>           V = Builder.CreateICmp(P0, L, R);
>   
> -      propagateIRFlags(V, E->Scalars, VL0);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
> +      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
>         ++NumVectorInstructions;
>         return V;
>       }
> @@ -3100,16 +2913,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         Value *True = vectorizeTree(TrueVec);
>         Value *False = vectorizeTree(FalseVec);
>   
> -      if (E->VectorizedValue) {
> -        DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
> -        return E->VectorizedValue;
> -      }
> +      if (Value *V = alreadyVectorized(E->Scalars, VL0))
> +        return V;
>   
>         Value *V = Builder.CreateSelect(Cond, True, False);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
>         ++NumVectorInstructions;
>         return V;
> @@ -3148,24 +2955,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         Value *LHS = vectorizeTree(LHSVL);
>         Value *RHS = vectorizeTree(RHSVL);
>   
> -      if (E->VectorizedValue) {
> -        DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
> -        return E->VectorizedValue;
> -      }
> +      if (Value *V = alreadyVectorized(E->Scalars, VL0))
> +        return V;
>   
>         Value *V = Builder.CreateBinOp(
>             static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
> -      propagateIRFlags(V, E->Scalars, VL0);
> -      if (auto *I = dyn_cast<Instruction>(V))
> -        V = propagateMetadata(I, E->Scalars);
> -
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
> +      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
>         ++NumVectorInstructions;
>   
> +      if (Instruction *I = dyn_cast<Instruction>(V))
> +        return propagateMetadata(I, E->Scalars);
> +
>         return V;
>       }
>       case Instruction::Load: {
> @@ -3193,14 +2994,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>           Alignment = DL->getABITypeAlignment(ScalarLoadTy);
>         }
>         LI->setAlignment(Alignment);
> -      Value *V = propagateMetadata(LI, E->Scalars);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
> -      E->VectorizedValue = V;
> +      E->VectorizedValue = LI;
>         ++NumVectorInstructions;
> -      return V;
> +      return propagateMetadata(LI, E->Scalars);
>       }
>       case Instruction::Store: {
>         StoreInst *SI = cast<StoreInst>(VL0);
> @@ -3228,14 +3024,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>           Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
>   
>         S->setAlignment(Alignment);
> -      Value *V = propagateMetadata(S, E->Scalars);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
> -      E->VectorizedValue = V;
> +      E->VectorizedValue = S;
>         ++NumVectorInstructions;
> -      return V;
> +      return propagateMetadata(S, E->Scalars);
>       }
>       case Instruction::GetElementPtr: {
>         setInsertPointAfterBundle(E->Scalars, VL0);
> @@ -3259,16 +3050,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>   
>         Value *V = Builder.CreateGEP(
>             cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
> -      if (Instruction *I = dyn_cast<Instruction>(V))
> -        V = propagateMetadata(I, E->Scalars);
> -
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
>         ++NumVectorInstructions;
>   
> +      if (Instruction *I = dyn_cast<Instruction>(V))
> +        return propagateMetadata(I, E->Scalars);
> +
>         return V;
>       }
>       case Instruction::Call: {
> @@ -3315,12 +3102,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         if (ScalarArg && getTreeEntry(ScalarArg))
>           ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
>   
> -      propagateIRFlags(V, E->Scalars, VL0);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
> +      propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
>         ++NumVectorInstructions;
>         return V;
>       }
> @@ -3334,10 +3117,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         Value *LHS = vectorizeTree(LHSVL);
>         Value *RHS = vectorizeTree(RHSVL);
>   
> -      if (E->VectorizedValue) {
> -        DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
> -        return E->VectorizedValue;
> -      }
> +      if (Value *V = alreadyVectorized(E->Scalars, VL0))
> +        return V;
>   
>         // Create a vector of LHS op1 RHS
>         Value *V0 = Builder.CreateBinOp(
> @@ -3369,14 +3150,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry
>         propagateIRFlags(V1, OddScalars);
>   
>         Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
> -      if (Instruction *I = dyn_cast<Instruction>(V))
> -        V = propagateMetadata(I, E->Scalars);
> -      if (NeedToShuffleReuses) {
> -        V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
> -                                        E->ReuseShuffleIndices, "shuffle");
> -      }
>         E->VectorizedValue = V;
>         ++NumVectorInstructions;
> +      if (Instruction *I = dyn_cast<Instruction>(V))
> +        return propagateMetadata(I, E->Scalars);
>   
>         return V;
>       }
> @@ -3546,12 +3323,14 @@ void BoUpSLP::optimizeGatherSequence() {
>     DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
>           << " gather sequences instructions.\n");
>     // LICM InsertElementInst sequences.
> -  for (Instruction *I : GatherSeq) {
> -    if (!isa<InsertElementInst>(I) && !isa<ShuffleVectorInst>(I))
> +  for (Instruction *it : GatherSeq) {
> +    InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
> +
> +    if (!Insert)
>         continue;
>   
>       // Check if this block is inside a loop.
> -    Loop *L = LI->getLoopFor(I->getParent());
> +    Loop *L = LI->getLoopFor(Insert->getParent());
>       if (!L)
>         continue;
>   
> @@ -3563,15 +3342,15 @@ void BoUpSLP::optimizeGatherSequence() {
>       // If the vector or the element that we insert into it are
>       // instructions that are defined in this basic block then we can't
>       // hoist this instruction.
> -    auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
> -    auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
> -    if (Op0 && L->contains(Op0))
> +    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
> +    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
> +    if (CurrVec && L->contains(CurrVec))
>         continue;
> -    if (Op1 && L->contains(Op1))
> +    if (NewElem && L->contains(NewElem))
>         continue;
>   
>       // We can hoist this instruction. Move it to the pre-header.
> -    I->moveBefore(PreHeader->getTerminator());
> +    Insert->moveBefore(PreHeader->getTerminator());
>     }
>   
>     // Make a list of all reachable blocks in our CSE queue.
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll?rev=323447&r1=323446&r2=323447&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/PR32086.ll Thu Jan 25 09:28:12 2018
> @@ -4,14 +4,15 @@
>   define void @i64_simplified(i64* noalias %st, i64* noalias %ld) {
>   ; CHECK-LABEL: @i64_simplified(
>   ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
> -; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>*
> -; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
> -; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +; CHECK-NEXT:    [[T0:%.*]] = load i64, i64* [[LD]], align 8
> +; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
>   ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
>   ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
>   ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
> -; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
> -; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8
> +; CHECK-NEXT:    store i64 [[T0]], i64* [[ST]], align 8
> +; CHECK-NEXT:    store i64 [[T1]], i64* [[ARRAYIDX3]], align 8
> +; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX4]], align 8
> +; CHECK-NEXT:    store i64 [[T1]], i64* [[ARRAYIDX5]], align 8
>   ; CHECK-NEXT:    ret void
>   ;
>     %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll?rev=323447&r1=323446&r2=323447&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll Thu Jan 25 09:28:12 2018
> @@ -137,19 +137,17 @@ define i8 @k(<4 x i8> %x) {
>   
>   define i8 @k_bb(<4 x i8> %x) {
>   ; CHECK-LABEL: @k_bb(
> -; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
>   ; CHECK-NEXT:    br label [[BB1:%.*]]
>   ; CHECK:       bb1:
> -; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
> -; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
> -; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
> -; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]]
> -; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]]
> -; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
> -; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
> -; CHECK-NEXT:    [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]]
> -; CHECK-NEXT:    [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]]
> -; CHECK-NEXT:    ret i8 [[TMP6]]
> +; CHECK-NEXT:    [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]]
> +; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> <i32 0, i32 1>
> +; CHECK-NEXT:    [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]]
> +; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> <i32 3, i32 2>
> +; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]]
> +; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0
> +; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1
> +; CHECK-NEXT:    [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]]
> +; CHECK-NEXT:    ret i8 [[TMP8]]
>   ;
>     %x0 = extractelement <4 x i8> %x, i32 0
>     br label %bb1
>
> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/hoist.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/hoist.ll?rev=323447&r1=323446&r2=323447&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/hoist.ll (original)
> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/hoist.ll Thu Jan 25 09:28:12 2018
> @@ -16,18 +16,19 @@ target triple = "i386-apple-macosx10.9.0
>   define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
>   ; CHECK-LABEL: @foo(
>   ; CHECK-NEXT:  entry:
> -; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[N:%.*]], i32 0
> -; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
> -; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
> +; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
> +; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[K:%.*]], i32 1
> +; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2
> +; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[K]], i32 3
>   ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
>   ; CHECK:       for.body:
>   ; CHECK-NEXT:    [[I_024:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
>   ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]]
> -; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
> -; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
> -; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP3]]
> -; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
> -; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
> +; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
> +; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
> +; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP5]]
> +; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
> +; CHECK-NEXT:    store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4
>   ; CHECK-NEXT:    [[ADD10]] = add nsw i32 [[I_024]], 4
>   ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[ADD10]], 10000
>   ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits



More information about the llvm-commits mailing list