[llvm] r296863 - [SLP] Fixes the bug due to absence of in order uses of scalars which needs to be available
Michael Kuperstein via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 6 16:08:02 PST 2017
I've reverted this again in r297103.
On Mon, Mar 6, 2017 at 3:44 PM, Michael Kuperstein <mkuper at google.com> wrote:
> Hi Shahid,
>
> Unfortunately, this still fails in some cases - and it's partially my
> fault, since it's caused by a change I suggested on the review.
>
> The problem is that we end up having a single load bundle that's used in
> two different orders. Since we only have a single tree entry for that
> bundle, we generate a single mask, and use the same shuffle for both uses.
> Note that I'm not sure computing the mask on-demand (like we did before)
> would always help. I think it would break if one of the uses was in-order,
> another was out-of-order, and we reached buildTree_vec with the in-order
> use first. We'd then set NeedToShuffle to false, and wouldn't generate a
> shuffle for the out-of-order use.
>
> I'm not sure if the right solution is to generate the mask on demand like
> we did before (but then we need to make sure the issue above doesn't
> happen), or have distinct tree entries.
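>
> To make the failure concrete, here's a minimal sketch (illustrative C++
> only; FakeTreeEntry and the values are made up, not the real
> SLPVectorizer types) of why one per-entry mask can't serve two
> differently-ordered uses:
>
> #include <vector>
>
> // One tree entry carries a single shuffle mask for its load bundle.
> struct FakeTreeEntry {
>   std::vector<unsigned> ShuffleMask;
> };
>
> int main() {
>   // One bundle {L0, L1, L2, L3}, two users: user A wants the lanes
>   // in order {0, 1, 2, 3}, user B wants them jumbled {1, 0, 3, 2}.
>   // Both users resolve to the same entry, so whichever mask gets
>   // recorded first is emitted for both uses; user A then sees the
>   // wrong shuffle (the bad %3 in the reproducer below).
>   FakeTreeEntry E;
>   E.ShuffleMask = {1, 0, 3, 2}; // recorded for user B
>   return 0;
> }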
>
> Reduced example:
>
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-linux-gnu"
>
> %complex = type { { double, double } }
>
> define void @eggs(%complex* %p) local_unnamed_addr #0 {
> bb:
> br i1 undef, label %bb2, label %bb1
>
> bb1: ; preds = %bb
> ret void
>
> bb2: ; preds = %bb2, %bb
> %phi0 = phi double [ %tmp21, %bb2 ], [ 0.000000e+00, %bb ]
> %phi1 = phi double [ %tmp20, %bb2 ], [ 0.000000e+00, %bb ]
> %phi2 = phi double [ %tmp29, %bb2 ], [ 0.000000e+00, %bb ]
> %phi3 = phi double [ %tmp28, %bb2 ], [ 0.000000e+00, %bb ]
>
> %gep0 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 0
> %load0 = load double, double* %gep0, align 8
> %gep1 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 1
> %load1 = load double, double* %gep1, align 8
> %gep2 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 0
> %load2 = load double, double* %gep2, align 8
> %gep3 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 1
> %load3 = load double, double* %gep3, align 8
>
> %tmp14 = fmul double 1.0, %load0
> %tmp15 = fmul double 2.0, %load1
> %tmp16 = fsub double %tmp14, %tmp15
> %tmp17 = fmul double 1.0, %load1
> %tmp18 = fmul double 2.0, %load0
> %tmp19 = fsub double %tmp17, %tmp18
> %tmp20 = fadd double %phi1, %tmp16
> %tmp21 = fadd double %phi0, %tmp19
> %tmp22 = fmul double 1.0, %load2
> %tmp23 = fmul double 2.0, %load3
> %tmp24 = fsub double %tmp22, %tmp23
> %tmp25 = fmul double 1.0, %load3
> %tmp26 = fmul double 2.0, %load2
> %tmp27 = fsub double %tmp25, %tmp26
> %tmp28 = fadd double %phi3, %tmp24
> %tmp29 = fadd double %phi2, %tmp27
> br label %bb2
> }
>
> Good output:
>
> bb2: ; preds = %bb2, %bb
> %0 = phi <4 x double> [ %10, %bb2 ], [ zeroinitializer, %bb ]
> %gep0 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 0
> %gep1 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 1
> %gep2 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 0
> %gep3 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 1
> %1 = bitcast double* %gep0 to <4 x double>*
> %2 = load <4 x double>, <4 x double>* %1, align 8
> %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
> %4 = bitcast double* %gep0 to <4 x double>*
> %5 = load <4 x double>, <4 x double>* %4, align 8
> %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> %7 = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %6
> %8 = fmul <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>, %3
> %9 = fsub <4 x double> %7, %8
> %10 = fadd <4 x double> %0, %9
> br label %bb2
>
> Bad output:
> bb2: ; preds = %bb2, %bb
> %0 = phi <4 x double> [ %10, %bb2 ], [ zeroinitializer, %bb ]
> %gep0 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 0
> %gep1 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 1
> %gep2 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 0
> %gep3 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 1
> %1 = bitcast double* %gep0 to <4 x double>*
> %2 = load <4 x double>, <4 x double>* %1, align 8
> %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> %4 = bitcast double* %gep0 to <4 x double>*
> %5 = load <4 x double>, <4 x double>* %4, align 8
> %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> %7 = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %6
> %8 = fmul <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>, %3
> %9 = fsub <4 x double> %7, %8
> %10 = fadd <4 x double> %0, %9
> br label %bb2
>
> Note the difference in %3.
> (In this reproducer, the "good" %3 shuffle is a nop, but the original case
> I saw had a different order.)
>
>
>
> On Fri, Mar 3, 2017 at 2:02 AM, Mohammad Shahid via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>
>> Author: ashahid
>> Date: Fri Mar 3 04:02:47 2017
>> New Revision: 296863
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=296863&view=rev
>> Log:
>> [SLP] Fixes a bug due to the absence of the in-order uses of scalars, which
>> need to be available for the vectorizeTree() API. That API uses them for the
>> proper mask computation to be used in the shufflevector IR.
>> The fix is to compute the mask for out-of-order memory accesses while
>> building the vectorizable tree, instead of during the actual vectorization
>> of the tree. It also needs to recompute the proper Lane for external uses
>> of vectorizable scalars based on the shuffle mask.
>>
>> Reviewers: mkuper
>>
>> Differential Revision: https://reviews.llvm.org/D30159
>>
>> Change-Id: Ide8773ce0ad3562f3cf4d1a0ad0f487e2f60ce5d
>>
>> Added:
>> llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll
>> Modified:
>> llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
>> llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
>> llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll
>>
>> Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
>> +++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Fri Mar 3 04:02:47 2017
>> @@ -660,12 +660,15 @@ int64_t getPtrStride(PredicatedScalarEvo
>> /// \brief Try to sort an array of loads / stores.
>> ///
>> /// An array of loads / stores can only be sorted if all pointer operands
>> -/// refer to the same object, and the differences between these pointers
>> +/// refer to the same object, and the differences between these pointers
>> /// are known to be constant. If that is the case, this returns true, and the
>> /// sorted array is returned in \p Sorted. Otherwise, this returns false, and
>> /// \p Sorted is invalid.
>> +// If \p Mask is not null, it also returns the \p Mask which is the shuffle
>> +// mask for actual memory access order.
>> bool sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
>> - ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
>> + ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted,
>> + SmallVectorImpl<unsigned> *Mask = nullptr);
>>
>> /// \brief Returns true if the memory operations \p A and \p B are consecutive.
>> /// This is a simple API that does not depend on the analysis pass.
>>
>> Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
>> +++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Fri Mar 3 04:02:47 2017
>> @@ -1040,7 +1040,8 @@ static unsigned getAddressSpaceOperand(V
>>
>> bool llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
>> ScalarEvolution &SE,
>> - SmallVectorImpl<Value *> &Sorted) {
>> + SmallVectorImpl<Value *> &Sorted,
>> + SmallVectorImpl<unsigned> *Mask) {
>> SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
>> OffValPairs.reserve(VL.size());
>> Sorted.reserve(VL.size());
>> @@ -1050,7 +1051,6 @@ bool llvm::sortMemAccesses(ArrayRef<Valu
>> Value *Ptr0 = getPointerOperand(VL[0]);
>> const SCEV *Scev0 = SE.getSCEV(Ptr0);
>> Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
>> -
>> for (auto *Val : VL) {
>> // The only kind of access we care about here is load.
>> if (!isa<LoadInst>(Val))
>> @@ -1077,14 +1077,30 @@ bool llvm::sortMemAccesses(ArrayRef<Valu
>> OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val);
>> }
>>
>> - std::sort(OffValPairs.begin(), OffValPairs.end(),
>> - [](const std::pair<int64_t, Value *> &Left,
>> - const std::pair<int64_t, Value *> &Right) {
>> - return Left.first < Right.first;
>> + SmallVector<unsigned, 4> UseOrder(VL.size());
>> + for (unsigned i = 0; i < VL.size(); i++) {
>> + UseOrder[i] = i;
>> + }
>> +
>> + // Sort the memory accesses and keep the order of their uses in UseOrder.
>> + std::sort(UseOrder.begin(), UseOrder.end(),
>> + [&OffValPairs](unsigned Left, unsigned Right) {
>> + return OffValPairs[Left].first < OffValPairs[Right].first;
>> });
>>
>> - for (auto &it : OffValPairs)
>> - Sorted.push_back(it.second);
>> + for (unsigned i = 0; i < VL.size(); i++)
>> + Sorted.emplace_back(OffValPairs[UseOrder[i]].second);
>> +
>> + // Sort UseOrder to compute the Mask.
>> + if (Mask) {
>> + Mask->reserve(VL.size());
>> + for (unsigned i = 0; i < VL.size(); i++)
>> + Mask->emplace_back(i);
>> + std::sort(Mask->begin(), Mask->end(),
>> + [&UseOrder](unsigned Left, unsigned Right) {
>> + return UseOrder[Left] < UseOrder[Right];
>> + });
>> + }
>>
>> return true;
>> }
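>>
>> (The two sorts above amount to computing an inverse permutation:
>> Sorted[i] = VL[UseOrder[i]], and sorting an identity Mask by the
>> UseOrder values yields Mask[k] = the sorted slot of original element k.
>> A standalone sketch of that relationship, with made-up values; this is
>> not LLVM code:)
>>
>> #include <cassert>
>> #include <vector>
>>
>> int main() {
>>   // Memory order of the accesses: Sorted[i] = VL[UseOrder[i]].
>>   std::vector<unsigned> UseOrder = {2, 0, 3, 1};
>>   // Equivalent to the std::sort over Mask above: invert UseOrder.
>>   std::vector<unsigned> Mask(UseOrder.size());
>>   for (unsigned i = 0; i < UseOrder.size(); ++i)
>>     Mask[UseOrder[i]] = i; // VL[k] lands in sorted slot Mask[k]
>>   assert((Mask == std::vector<unsigned>{1, 3, 0, 2}));
>>   return 0;
>> }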
>>
>> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Fri Mar 3 04:02:47 2017
>> @@ -423,10 +423,8 @@ private:
>> /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
>> bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
>>
>> - /// Vectorize a single entry in the tree. VL icontains all isomorphic scalars
>> - /// in order of its usage in a user program, for example ADD1, ADD2 and so on
>> - /// or LOAD1 , LOAD2 etc.
>> - Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E);
>> + /// Vectorize a single entry in the tree.
>> + Value *vectorizeTree(TreeEntry *E);
>>
>> /// Vectorize a single entry in the tree, starting in \p VL.
>> Value *vectorizeTree(ArrayRef<Value *> VL);
>> @@ -466,8 +464,8 @@ private:
>> SmallVectorImpl<Value *> &Left,
>> SmallVectorImpl<Value *> &Right);
>> struct TreeEntry {
>> - TreeEntry() : Scalars(), VectorizedValue(nullptr),
>> - NeedToGather(0), NeedToShuffle(0) {}
>> + TreeEntry()
>> + : Scalars(), VectorizedValue(nullptr), NeedToGather(0), ShuffleMask() {}
>>
>> /// \returns true if the scalars in VL are equal to this entry.
>> bool isSame(ArrayRef<Value *> VL) const {
>> @@ -495,19 +493,23 @@ private:
>> /// Do we need to gather this sequence ?
>> bool NeedToGather;
>>
>> - /// Do we need to shuffle the load ?
>> - bool NeedToShuffle;
>> + /// Records optional shuffle mask for jumbled memory accesses in this.
>> + SmallVector<unsigned, 8> ShuffleMask;
>> +
>> };
>>
>> /// Create a new VectorizableTree entry.
>> TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
>> - bool NeedToShuffle) {
>> + ArrayRef<unsigned> ShuffleMask = None) {
>> VectorizableTree.emplace_back();
>> int idx = VectorizableTree.size() - 1;
>> TreeEntry *Last = &VectorizableTree[idx];
>> Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
>> Last->NeedToGather = !Vectorized;
>> - Last->NeedToShuffle = NeedToShuffle;
>> + if (!ShuffleMask.empty())
>> + Last->ShuffleMask.insert(Last->ShuffleMask.begin(), ShuffleMask.begin(),
>> + ShuffleMask.end());
>> +
>> if (Vectorized) {
>> for (int i = 0, e = VL.size(); i != e; ++i) {
>> assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
>> @@ -1030,21 +1032,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>
>> if (Depth == RecursionMaxDepth) {
>> DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> // Don't handle vectors.
>> if (VL[0]->getType()->isVectorTy()) {
>> DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
>> if (SI->getValueOperand()->getType()->isVectorTy()) {
>> DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> unsigned Opcode = getSameOpcode(VL);
>> @@ -1061,7 +1063,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> // If all of the operands are identical or constant we have a simple solution.
>> if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
>> DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> @@ -1073,7 +1075,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (EphValues.count(VL[i])) {
>> DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>> ") is ephemeral.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1086,7 +1088,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
>> if (E->Scalars[i] != VL[i]) {
>> DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1099,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (ScalarToTreeEntry.count(VL[i])) {
>> DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>> ") is already in tree.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1109,7 +1111,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> for (unsigned i = 0, e = VL.size(); i != e; ++i) {
>> if (MustGather.count(VL[i])) {
>> DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1123,7 +1125,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> // Don't go into unreachable blocks. They may contain instructions with
>> // dependency cycles which confuse the final scheduling.
>> DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> @@ -1132,7 +1134,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> for (unsigned j = i+1; j < e; ++j)
>> if (VL[i] == VL[j]) {
>> DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> @@ -1147,7 +1149,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> assert((!BS.getScheduleData(VL[0]) ||
>> !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
>> "tryScheduleBundle should cancelScheduling on failure");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
>> @@ -1164,12 +1166,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (Term) {
>> DEBUG(dbgs() << "SLP: Need to swizzle PHINodes
>> (TerminatorInst use).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
>>
>> for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
>> @@ -1191,7 +1193,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> } else {
>> BS.cancelScheduling(VL);
>> }
>> - newTreeEntry(VL, Reuse, false);
>> + newTreeEntry(VL, Reuse);
>> return;
>> }
>> case Instruction::Load: {
>> @@ -1207,7 +1209,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (DL->getTypeSizeInBits(ScalarTy) !=
>> DL->getTypeAllocSizeInBits(ScalarTy)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
>> return;
>> }
>> @@ -1218,7 +1220,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> LoadInst *L = cast<LoadInst>(VL[i]);
>> if (!L->isSimple()) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
>> return;
>> }
>> @@ -1238,7 +1240,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>
>> if (Consecutive) {
>> ++NumLoadsWantToKeepOrder;
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of loads.\n");
>> return;
>> }
>> @@ -1255,7 +1257,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (VL.size() > 2 && !ReverseConsecutive) {
>> bool ShuffledLoads = true;
>> SmallVector<Value *, 8> Sorted;
>> - if (sortMemAccesses(VL, *DL, *SE, Sorted)) {
>> + SmallVector<unsigned, 4> Mask;
>> + if (sortMemAccesses(VL, *DL, *SE, Sorted, &Mask)) {
>> auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
>> for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
>> if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
>> @@ -1264,14 +1267,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> }
>> }
>> if (ShuffledLoads) {
>> - newTreeEntry(NewVL, true, true);
>> + newTreeEntry(NewVL, true, makeArrayRef(Mask.begin(), Mask.end()));
>> return;
>> }
>> }
>> }
>>
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>>
>> if (ReverseConsecutive) {
>> ++NumLoadsWantToChangeOrder;
>> @@ -1298,12 +1301,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> Type *Ty = cast<Instruction>(Val)->getOperand(0)->getType();
>> if (Ty != SrcTy || !isValidElementType(Ty)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering casts with different src
>> types.\n");
>> return;
>> }
>> }
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of casts.\n");
>>
>> for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>> @@ -1326,13 +1329,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (Cmp->getPredicate() != P0 ||
>> Cmp->getOperand(0)->getType() != ComparedTy) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering cmp with different
>> predicate.\n");
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of compares.\n");
>>
>> for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>> @@ -1364,7 +1367,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> case Instruction::And:
>> case Instruction::Or:
>> case Instruction::Xor: {
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
>>
>> // Sort operands of the instructions so that each side is more likely to
>> @@ -1393,7 +1396,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (cast<Instruction>(Val)->getNumOperands() != 2) {
>> DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested
>> indexes).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1406,7 +1409,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (Ty0 != CurTy) {
>> DEBUG(dbgs() << "SLP: not-vectorizable GEP (different
>> types).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1418,12 +1421,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> DEBUG(
>> dbgs() << "SLP: not-vectorizable GEP (non-constant
>> indexes).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
>> for (unsigned i = 0, e = 2; i < e; ++i) {
>> ValueList Operands;
>> @@ -1440,12 +1443,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
>> if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
>> return;
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of stores.\n");
>>
>> ValueList Operands;
>> @@ -1463,7 +1466,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
>> if (!isTriviallyVectorizable(ID)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
>> return;
>> }
>> @@ -1477,7 +1480,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
>> !CI->hasIdenticalOperandBundleSchema(*CI2)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" <<
>> *VL[i]
>> << "\n");
>> return;
>> @@ -1488,7 +1491,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> Value *A1J = CI2->getArgOperand(1);
>> if (A1I != A1J) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
>> << " argument "<< A1I<<"!=" << A1J
>> << "\n");
>> @@ -1501,14 +1504,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> CI->op_begin() + CI->getBundleOperandsEndIndex(),
>> CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" <<
>> *CI << "!="
>> << *VL[i] << '\n');
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
>> ValueList Operands;
>> // Prepare the operand vector.
>> @@ -1525,11 +1528,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> // then do not vectorize this instruction.
>> if (!isAltShuffle) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
>> return;
>> }
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
>>
>> // Reorder operands if reordering would enable vectorization.
>> @@ -1553,7 +1556,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> }
>> default:
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
>> return;
>> }
>> @@ -1792,7 +1795,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>> TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
>> int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0);
>> - if (E->NeedToShuffle) {
>> + if (!E->ShuffleMask.empty()) {
>> VecLdCost += TTI->getShuffleCost(
>> TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
>> }
>> @@ -2358,8 +2361,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>> if (ScalarToTreeEntry.count(VL[0])) {
>> int Idx = ScalarToTreeEntry[VL[0]];
>> TreeEntry *E = &VectorizableTree[Idx];
>> - if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE)))
>> - return vectorizeTree(VL, E);
>> + if (E->isSame(VL) ||
>> + (!E->ShuffleMask.empty() && E->isFoundJumbled(VL, *DL, *SE)))
>> + return vectorizeTree(E);
>> }
>>
>> Type *ScalarTy = VL[0]->getType();
>> @@ -2370,10 +2374,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>> return Gather(VL, VecTy);
>> }
>>
>> -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
>> +Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
>> IRBuilder<>::InsertPointGuard Guard(Builder);
>>
>> - if (E->VectorizedValue && !E->NeedToShuffle) {
>> + if (E->VectorizedValue && E->ShuffleMask.empty()) {
>> DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] <<
>> ".\n");
>> return E->VectorizedValue;
>> }
>> @@ -2611,27 +2615,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>>
>> // As program order of scalar loads are jumbled, the vectorized 'load'
>> // must be followed by a 'shuffle' with the required jumbled mask.
>> - if (!VL.empty() && (E->NeedToShuffle)) {
>> - assert(VL.size() == E->Scalars.size() &&
>> - "Equal number of scalars expected");
>> + if (!E->ShuffleMask.empty()) {
>> SmallVector<Constant *, 8> Mask;
>> - for (Value *Val : VL) {
>> - if (ScalarToTreeEntry.count(Val)) {
>> - int Idx = ScalarToTreeEntry[Val];
>> - TreeEntry *E = &VectorizableTree[Idx];
>> - for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
>> - if (E->Scalars[Lane] == Val) {
>> - Mask.push_back(Builder.getInt32(Lane));
>> - break;
>> - }
>> - }
>> - }
>> + for (unsigned Lane = 0, LE = E->ShuffleMask.size(); Lane != LE;
>> + ++Lane) {
>> + Mask.push_back(Builder.getInt32(E->ShuffleMask[Lane]));
>> }
>> -
>> // Generate shuffle for jumbled memory access
>> Value *Undef = UndefValue::get(VecTy);
>> Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
>> ConstantVector::get(Mask));
>> + E->VectorizedValue = Shuf;
>> + ++NumVectorInstructions;
>> return Shuf;
>> }
>>
>> @@ -2816,7 +2811,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>> }
>>
>> Builder.SetInsertPoint(&F->getEntryBlock().front());
>> - auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]);
>> + auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
>>
>> // If the vectorized tree can be rewritten in a smaller type, we truncate the
>> // vectorized root. InstCombine will then rewrite the entire expression. We
>> @@ -2861,8 +2856,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>>
>> Value *Vec = E->VectorizedValue;
>> assert(Vec && "Can't find vectorizable value");
>> -
>> - Value *Lane = Builder.getInt32(ExternalUse.Lane);
>> + unsigned i = 0;
>> + Value *Lane;
>> + // In case vectorizable scalars use are not in-order, scalars would have
>> + // been shuffled. Recompute the proper Lane of ExternalUse.
>> + if (!E->ShuffleMask.empty()) {
>> + SmallVector<unsigned, 4> Val(E->ShuffleMask.size());
>> + for (; i < E->ShuffleMask.size(); i++) {
>> + if (E->ShuffleMask[i] == (unsigned)ExternalUse.Lane)
>> + break;
>> + }
>> + Lane = Builder.getInt32(i);
>> + } else {
>> + Lane = Builder.getInt32(ExternalUse.Lane);
>> + }
>> // If User == nullptr, the Scalar is used as extra arg. Generate
>> // ExtractElement instruction and update the record for this scalar in
>> // ExternallyUsedValues.
>>
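>> The Lane fix-up in the hunk above is a linear search for the slot of
>> the shuffled vector that now holds the original scalar: the shuffle
>> puts the scalar from lane ShuffleMask[i] into slot i, so the extract
>> must use the i with ShuffleMask[i] == ExternalUse.Lane. A standalone
>> sketch of the same lookup (illustrative names, not LLVM code):
>>
>> #include <vector>
>>
>> // Return the post-shuffle slot holding original lane OrigLane;
>> // with no mask the order is unchanged and the lane passes through.
>> unsigned recomputeLane(const std::vector<unsigned> &ShuffleMask,
>>                        unsigned OrigLane) {
>>   for (unsigned i = 0; i < ShuffleMask.size(); ++i)
>>     if (ShuffleMask[i] == OrigLane)
>>       return i;
>>   return OrigLane; // ShuffleMask empty: scalars were not reordered
>> }
>>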
>> Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll?rev=296863&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll (added)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll Fri Mar 3 04:02:47 2017
>> @@ -0,0 +1,43 @@
>> +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
>> +; RUN: opt < %s -S -slp-vectorizer | FileCheck %s
>> +
>> +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>> +target triple = "x86_64-unknown-linux-gnu"
>> +
>> +define <4 x i32> @zot() #0 {
>> +; CHECK-LABEL: @zot(
>> +; CHECK-NEXT: bb:
>> +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0
>> +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1
>> +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2
>> +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3
>> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
>> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
>> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
>> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
>> +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i8> undef, i8 [[TMP3]], i32 0
>> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
>> +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x i8> [[I0]], i8 [[TMP4]], i32 1
>> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
>> +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x i8> [[I1]], i8 [[TMP5]], i32 2
>> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
>> +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x i8> [[I2]], i8 [[TMP6]], i32 3
>> +; CHECK-NEXT: [[RET:%.*]] = zext <4 x i8> [[I3]] to <4 x i32>
>> +; CHECK-NEXT: ret <4 x i32> [[RET]]
>> +;
>> +bb:
>> + %p0 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0
>> + %p1 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1
>> + %p2 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2
>> + %p3 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3
>> + %v3 = load i8, i8* %p3, align 1
>> + %v2 = load i8, i8* %p2, align 1
>> + %v0 = load i8, i8* %p0, align 1
>> + %v1 = load i8, i8* %p1, align 1
>> + %i0 = insertelement <4 x i8> undef, i8 %v1, i32 0
>> + %i1 = insertelement <4 x i8> %i0, i8 %v0, i32 1
>> + %i2 = insertelement <4 x i8> %i1, i8 %v2, i32 2
>> + %i3 = insertelement <4 x i8> %i2, i8 %v3, i32 3
>> + %ret = zext <4 x i8> %i3 to <4 x i32>
>> + ret <4 x i32> %ret
>> +}
>>
>> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll (original)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll Fri Mar 3 04:02:47 2017
>> @@ -13,7 +13,7 @@ define i32 @fn1() {
>> ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
>> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
>> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
>> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
>> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
>> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
>> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
>> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
>>
>>