[llvm] r296863 - [SLP] Fixes the bug due to absence of in order uses of scalars which needs to be available
Michael Kuperstein via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 6 16:08:02 PST 2017
I've reverted this again in r297103.
On Mon, Mar 6, 2017 at 3:44 PM, Michael Kuperstein <mkuper at google.com> wrote:
> Hi Shahid,
>
> Unfortunately, this still fails in some cases - and it's partially my
> fault, since it's caused by a change I suggested on the review.
>
> The problem is that we end up having a single load bundle that's used in
> two different orders. Since we only have a single tree entry for that
> bundle, we generate a single mask, and use the same shuffle for both uses.
> Note that I'm not sure computing the mask on-demand (like we did before)
> would always help. I think it would break if one of the uses was in-order,
> another was out-of-order, and we reached buildTree_vec with the in-order
> use first. We'd then set NeedToShuffle to false, and wouldn't generate a
> shuffle for the out-of-order use.
>
> I'm not sure if the right solution is to generate the mask on demand like
> we did before (but then we need to make sure the issue above doesn't
> happen), or have distinct tree entries.
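>
> To make the failure concrete, here's a minimal sketch (illustrative C++
> only; FakeTreeEntry and the values are made up, not the real
> SLPVectorizer types) of why one per-entry mask can't serve two
> differently-ordered uses:
>
> #include <vector>
>
> // One tree entry carries a single shuffle mask for its load bundle.
> struct FakeTreeEntry {
>   std::vector<unsigned> ShuffleMask;
> };
>
> int main() {
>   // One bundle {L0, L1, L2, L3}, two users: user A wants the lanes
>   // in order {0, 1, 2, 3}, user B wants them jumbled {1, 0, 3, 2}.
>   // Both users resolve to the same entry, so whichever mask gets
>   // recorded first is emitted for both uses; user A then sees the
>   // wrong shuffle (the bad %3 in the reproducer below).
>   FakeTreeEntry E;
>   E.ShuffleMask = {1, 0, 3, 2}; // recorded for user B
>   return 0;
> }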
>
> Reduced example:
>
> target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
> target triple = "x86_64-pc-linux-gnu"
>
> %complex = type { { double, double } }
>
> define void @eggs(%complex* %p) local_unnamed_addr #0 {
> bb:
> br i1 undef, label %bb2, label %bb1
>
> bb1: ; preds = %bb
> ret void
>
> bb2: ; preds = %bb2, %bb
> %phi0 = phi double [ %tmp21, %bb2 ], [ 0.000000e+00, %bb ]
> %phi1 = phi double [ %tmp20, %bb2 ], [ 0.000000e+00, %bb ]
> %phi2 = phi double [ %tmp29, %bb2 ], [ 0.000000e+00, %bb ]
> %phi3 = phi double [ %tmp28, %bb2 ], [ 0.000000e+00, %bb ]
>
> %gep0 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 0
> %load0 = load double, double* %gep0, align 8
> %gep1 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 1
> %load1 = load double, double* %gep1, align 8
> %gep2 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 0
> %load2 = load double, double* %gep2, align 8
> %gep3 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 1
> %load3 = load double, double* %gep3, align 8
>
> %tmp14 = fmul double 1.0, %load0
> %tmp15 = fmul double 2.0, %load1
> %tmp16 = fsub double %tmp14, %tmp15
> %tmp17 = fmul double 1.0, %load1
> %tmp18 = fmul double 2.0, %load0
> %tmp19 = fsub double %tmp17, %tmp18
> %tmp20 = fadd double %phi1, %tmp16
> %tmp21 = fadd double %phi0, %tmp19
> %tmp22 = fmul double 1.0, %load2
> %tmp23 = fmul double 2.0, %load3
> %tmp24 = fsub double %tmp22, %tmp23
> %tmp25 = fmul double 1.0, %load3
> %tmp26 = fmul double 2.0, %load2
> %tmp27 = fsub double %tmp25, %tmp26
> %tmp28 = fadd double %phi3, %tmp24
> %tmp29 = fadd double %phi2, %tmp27
> br label %bb2
> }
>
> Good output:
>
> bb2: ; preds = %bb2, %bb
> %0 = phi <4 x double> [ %10, %bb2 ], [ zeroinitializer, %bb ]
> %gep0 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 0
> %gep1 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 1
> %gep2 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 0
> %gep3 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 1
> %1 = bitcast double* %gep0 to <4 x double>*
> %2 = load <4 x double>, <4 x double>* %1, align 8
> %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
> %4 = bitcast double* %gep0 to <4 x double>*
> %5 = load <4 x double>, <4 x double>* %4, align 8
> %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> %7 = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %6
> %8 = fmul <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>, %3
> %9 = fsub <4 x double> %7, %8
> %10 = fadd <4 x double> %0, %9
> br label %bb2
>
> Bad output:
> bb2: ; preds = %bb2, %bb
> %0 = phi <4 x double> [ %10, %bb2 ], [ zeroinitializer, %bb ]
> %gep0 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 0
> %gep1 = getelementptr inbounds %complex, %complex* %p, i64 0, i32 0, i32 1
> %gep2 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 0
> %gep3 = getelementptr inbounds %complex, %complex* %p, i64 1, i32 0, i32 1
> %1 = bitcast double* %gep0 to <4 x double>*
> %2 = load <4 x double>, <4 x double>* %1, align 8
> %3 = shufflevector <4 x double> %2, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> %4 = bitcast double* %gep0 to <4 x double>*
> %5 = load <4 x double>, <4 x double>* %4, align 8
> %6 = shufflevector <4 x double> %5, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
> %7 = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %6
> %8 = fmul <4 x double> <double 2.000000e+00, double 2.000000e+00, double 2.000000e+00, double 2.000000e+00>, %3
> %9 = fsub <4 x double> %7, %8
> %10 = fadd <4 x double> %0, %9
> br label %bb2
>
> Note the difference in %3.
> (In this reproducer, the "good" %3 shuffle is a nop, but the original case
> I saw had a different order.)
>
>
>
> On Fri, Mar 3, 2017 at 2:02 AM, Mohammad Shahid via llvm-commits <llvm-commits at lists.llvm.org> wrote:
>
>> Author: ashahid
>> Date: Fri Mar 3 04:02:47 2017
>> New Revision: 296863
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=296863&view=rev
>> Log:
>> [SLP] Fixes a bug due to the absence of the in-order uses of scalars, which
>> need to be available for the vectorizeTree() API. That API uses them for the
>> proper mask computation to be used in the shufflevector IR.
>> The fix is to compute the mask for out-of-order memory accesses while
>> building the vectorizable tree, instead of during the actual vectorization
>> of the tree. It also needs to recompute the proper Lane for external uses
>> of vectorizable scalars based on the shuffle mask.
>>
>> Reviewers: mkuper
>>
>> Differential Revision: https://reviews.llvm.org/D30159
>>
>> Change-Id: Ide8773ce0ad3562f3cf4d1a0ad0f487e2f60ce5d
>>
>> Added:
>> llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll
>> Modified:
>> llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
>> llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
>> llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll
>>
>> Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
>> +++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Fri Mar 3 04:02:47 2017
>> @@ -660,12 +660,15 @@ int64_t getPtrStride(PredicatedScalarEvo
>> /// \brief Try to sort an array of loads / stores.
>> ///
>> /// An array of loads / stores can only be sorted if all pointer operands
>> -/// refer to the same object, and the differences between these pointers
>> +/// refer to the same object, and the differences between these pointers
>> /// are known to be constant. If that is the case, this returns true, and the
>> /// sorted array is returned in \p Sorted. Otherwise, this returns false, and
>> /// \p Sorted is invalid.
>> +// If \p Mask is not null, it also returns the \p Mask which is the shuffle
>> +// mask for actual memory access order.
>> bool sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
>> - ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
>> + ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted,
>> + SmallVectorImpl<unsigned> *Mask = nullptr);
>>
>> /// \brief Returns true if the memory operations \p A and \p B are consecutive.
>> /// This is a simple API that does not depend on the analysis pass.
>>
>> Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
>> +++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Fri Mar 3 04:02:47 2017
>> @@ -1040,7 +1040,8 @@ static unsigned getAddressSpaceOperand(V
>>
>> bool llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
>> ScalarEvolution &SE,
>> - SmallVectorImpl<Value *> &Sorted) {
>> + SmallVectorImpl<Value *> &Sorted,
>> + SmallVectorImpl<unsigned> *Mask) {
>> SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
>> OffValPairs.reserve(VL.size());
>> Sorted.reserve(VL.size());
>> @@ -1050,7 +1051,6 @@ bool llvm::sortMemAccesses(ArrayRef<Valu
>> Value *Ptr0 = getPointerOperand(VL[0]);
>> const SCEV *Scev0 = SE.getSCEV(Ptr0);
>> Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
>> -
>> for (auto *Val : VL) {
>> // The only kind of access we care about here is load.
>> if (!isa<LoadInst>(Val))
>> @@ -1077,14 +1077,30 @@ bool llvm::sortMemAccesses(ArrayRef<Valu
>> OffValPairs.emplace_back(Diff->getAPInt().getSExtValue(), Val);
>> }
>>
>> - std::sort(OffValPairs.begin(), OffValPairs.end(),
>> - [](const std::pair<int64_t, Value *> &Left,
>> - const std::pair<int64_t, Value *> &Right) {
>> - return Left.first < Right.first;
>> + SmallVector<unsigned, 4> UseOrder(VL.size());
>> + for (unsigned i = 0; i < VL.size(); i++) {
>> + UseOrder[i] = i;
>> + }
>> +
>> + // Sort the memory accesses and keep the order of their uses in UseOrder.
>> + std::sort(UseOrder.begin(), UseOrder.end(),
>> + [&OffValPairs](unsigned Left, unsigned Right) {
>> + return OffValPairs[Left].first < OffValPairs[Right].first;
>> });
>>
>> - for (auto &it : OffValPairs)
>> - Sorted.push_back(it.second);
>> + for (unsigned i = 0; i < VL.size(); i++)
>> + Sorted.emplace_back(OffValPairs[UseOrder[i]].second);
>> +
>> + // Sort UseOrder to compute the Mask.
>> + if (Mask) {
>> + Mask->reserve(VL.size());
>> + for (unsigned i = 0; i < VL.size(); i++)
>> + Mask->emplace_back(i);
>> + std::sort(Mask->begin(), Mask->end(),
>> + [&UseOrder](unsigned Left, unsigned Right) {
>> + return UseOrder[Left] < UseOrder[Right];
>> + });
>> + }
>>
>> return true;
>> }
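>>
>> (The two sorts above amount to computing an inverse permutation:
>> Sorted[i] = VL[UseOrder[i]], and sorting an identity Mask by the
>> UseOrder values yields Mask[k] = the sorted slot of original element k.
>> A standalone sketch of that relationship, with made-up values; this is
>> not LLVM code:)
>>
>> #include <cassert>
>> #include <vector>
>>
>> int main() {
>>   // Memory order of the accesses: Sorted[i] = VL[UseOrder[i]].
>>   std::vector<unsigned> UseOrder = {2, 0, 3, 1};
>>   // Equivalent to the std::sort over Mask above: invert UseOrder.
>>   std::vector<unsigned> Mask(UseOrder.size());
>>   for (unsigned i = 0; i < UseOrder.size(); ++i)
>>     Mask[UseOrder[i]] = i; // VL[k] lands in sorted slot Mask[k]
>>   assert((Mask == std::vector<unsigned>{1, 3, 0, 2}));
>>   return 0;
>> }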
>>
>> Modified: llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp (original)
>> +++ llvm/trunk/lib/Transforms/Vectorize/SLPVectorizer.cpp Fri Mar 3 04:02:47 2017
>> @@ -423,10 +423,8 @@ private:
>> /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
>> bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
>>
>> - /// Vectorize a single entry in the tree. VL icontains all isomorphic scalars
>> - /// in order of its usage in a user program, for example ADD1, ADD2 and so on
>> - /// or LOAD1 , LOAD2 etc.
>> - Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E);
>> + /// Vectorize a single entry in the tree.
>> + Value *vectorizeTree(TreeEntry *E);
>>
>> /// Vectorize a single entry in the tree, starting in \p VL.
>> Value *vectorizeTree(ArrayRef<Value *> VL);
>> @@ -466,8 +464,8 @@ private:
>> SmallVectorImpl<Value *> &Left,
>> SmallVectorImpl<Value *> &Right);
>> struct TreeEntry {
>> - TreeEntry() : Scalars(), VectorizedValue(nullptr),
>> - NeedToGather(0), NeedToShuffle(0) {}
>> + TreeEntry()
>> + : Scalars(), VectorizedValue(nullptr), NeedToGather(0), ShuffleMask() {}
>>
>> /// \returns true if the scalars in VL are equal to this entry.
>> bool isSame(ArrayRef<Value *> VL) const {
>> @@ -495,19 +493,23 @@ private:
>> /// Do we need to gather this sequence ?
>> bool NeedToGather;
>>
>> - /// Do we need to shuffle the load ?
>> - bool NeedToShuffle;
>> + /// Records optional shuffle mask for jumbled memory accesses in this.
>> + SmallVector<unsigned, 8> ShuffleMask;
>> +
>> };
>>
>> /// Create a new VectorizableTree entry.
>> TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
>> - bool NeedToShuffle) {
>> + ArrayRef<unsigned> ShuffleMask = None) {
>> VectorizableTree.emplace_back();
>> int idx = VectorizableTree.size() - 1;
>> TreeEntry *Last = &VectorizableTree[idx];
>> Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
>> Last->NeedToGather = !Vectorized;
>> - Last->NeedToShuffle = NeedToShuffle;
>> + if (!ShuffleMask.empty())
>> + Last->ShuffleMask.insert(Last->ShuffleMask.begin(), ShuffleMask.begin(),
>> + ShuffleMask.end());
>> +
>> if (Vectorized) {
>> for (int i = 0, e = VL.size(); i != e; ++i) {
>> assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
>> @@ -1030,21 +1032,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>
>> if (Depth == RecursionMaxDepth) {
>> DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> // Don't handle vectors.
>> if (VL[0]->getType()->isVectorTy()) {
>> DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
>> if (SI->getValueOperand()->getType()->isVectorTy()) {
>> DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> unsigned Opcode = getSameOpcode(VL);
>> @@ -1061,7 +1063,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> // If all of the operands are identical or constant we have a simple solution.
>> if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
>> DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> @@ -1073,7 +1075,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (EphValues.count(VL[i])) {
>> DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>> ") is ephemeral.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1086,7 +1088,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
>> if (E->Scalars[i] != VL[i]) {
>> DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1099,7 +1101,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (ScalarToTreeEntry.count(VL[i])) {
>> DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
>> ") is already in tree.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1109,7 +1111,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> for (unsigned i = 0, e = VL.size(); i != e; ++i) {
>> if (MustGather.count(VL[i])) {
>> DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1123,7 +1125,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> // Don't go into unreachable blocks. They may contain instructions with
>> // dependency cycles which confuse the final scheduling.
>> DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> @@ -1132,7 +1134,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> for (unsigned j = i+1; j < e; ++j)
>> if (VL[i] == VL[j]) {
>> DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>>
>> @@ -1147,7 +1149,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> assert((!BS.getScheduleData(VL[0]) ||
>> !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
>> "tryScheduleBundle should cancelScheduling on failure");
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
>> @@ -1164,12 +1166,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (Term) {
>> DEBUG(dbgs() << "SLP: Need to swizzle PHINodes
>> (TerminatorInst use).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
>>
>> for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
>> @@ -1191,7 +1193,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> } else {
>> BS.cancelScheduling(VL);
>> }
>> - newTreeEntry(VL, Reuse, false);
>> + newTreeEntry(VL, Reuse);
>> return;
>> }
>> case Instruction::Load: {
>> @@ -1207,7 +1209,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (DL->getTypeSizeInBits(ScalarTy) !=
>> DL->getTypeAllocSizeInBits(ScalarTy)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
>> return;
>> }
>> @@ -1218,7 +1220,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> LoadInst *L = cast<LoadInst>(VL[i]);
>> if (!L->isSimple()) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
>> return;
>> }
>> @@ -1238,7 +1240,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>>
>> if (Consecutive) {
>> ++NumLoadsWantToKeepOrder;
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of loads.\n");
>> return;
>> }
>> @@ -1255,7 +1257,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (VL.size() > 2 && !ReverseConsecutive) {
>> bool ShuffledLoads = true;
>> SmallVector<Value *, 8> Sorted;
>> - if (sortMemAccesses(VL, *DL, *SE, Sorted)) {
>> + SmallVector<unsigned, 4> Mask;
>> + if (sortMemAccesses(VL, *DL, *SE, Sorted, &Mask)) {
>> auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
>> for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
>> if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
>> @@ -1264,14 +1267,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> }
>> }
>> if (ShuffledLoads) {
>> - newTreeEntry(NewVL, true, true);
>> + newTreeEntry(NewVL, true, makeArrayRef(Mask.begin(), Mask.end()));
>> return;
>> }
>> }
>> }
>>
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>>
>> if (ReverseConsecutive) {
>> ++NumLoadsWantToChangeOrder;
>> @@ -1298,12 +1301,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> Type *Ty = cast<Instruction>(Val)->getOperand(0)->getType();
>> if (Ty != SrcTy || !isValidElementType(Ty)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering casts with different src
>> types.\n");
>> return;
>> }
>> }
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of casts.\n");
>>
>> for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>> @@ -1326,13 +1329,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (Cmp->getPredicate() != P0 ||
>> Cmp->getOperand(0)->getType() != ComparedTy) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering cmp with different
>> predicate.\n");
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of compares.\n");
>>
>> for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
>> @@ -1364,7 +1367,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> case Instruction::And:
>> case Instruction::Or:
>> case Instruction::Xor: {
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
>>
>> // Sort operands of the instructions so that each side is more likely to
>> @@ -1393,7 +1396,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (cast<Instruction>(Val)->getNumOperands() != 2) {
>> DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested
>> indexes).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1406,7 +1409,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> if (Ty0 != CurTy) {
>> DEBUG(dbgs() << "SLP: not-vectorizable GEP (different
>> types).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>> @@ -1418,12 +1421,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> DEBUG(
>> dbgs() << "SLP: not-vectorizable GEP (non-constant
>> indexes).\n");
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
>> for (unsigned i = 0, e = 2; i < e; ++i) {
>> ValueList Operands;
>> @@ -1440,12 +1443,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
>> if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
>> return;
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a vector of stores.\n");
>>
>> ValueList Operands;
>> @@ -1463,7 +1466,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
>> if (!isTriviallyVectorizable(ID)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
>> return;
>> }
>> @@ -1477,7 +1480,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
>> !CI->hasIdenticalOperandBundleSchema(*CI2)) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" <<
>> *VL[i]
>> << "\n");
>> return;
>> @@ -1488,7 +1491,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> Value *A1J = CI2->getArgOperand(1);
>> if (A1I != A1J) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
>> << " argument "<< A1I<<"!=" << A1J
>> << "\n");
>> @@ -1501,14 +1504,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> CI->op_begin() + CI->getBundleOperandsEndIndex(),
>> CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" <<
>> *CI << "!="
>> << *VL[i] << '\n');
>> return;
>> }
>> }
>>
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
>> ValueList Operands;
>> // Prepare the operand vector.
>> @@ -1525,11 +1528,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> // then do not vectorize this instruction.
>> if (!isAltShuffle) {
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
>> return;
>> }
>> - newTreeEntry(VL, true, false);
>> + newTreeEntry(VL, true);
>> DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
>>
>> // Reorder operands if reordering would enable vectorization.
>> @@ -1553,7 +1556,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Val
>> }
>> default:
>> BS.cancelScheduling(VL);
>> - newTreeEntry(VL, false, false);
>> + newTreeEntry(VL, false);
>> DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
>> return;
>> }
>> @@ -1792,7 +1795,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E)
>> TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
>> int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0);
>> - if (E->NeedToShuffle) {
>> + if (!E->ShuffleMask.empty()) {
>> VecLdCost += TTI->getShuffleCost(
>> TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
>> }
>> @@ -2358,8 +2361,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>> if (ScalarToTreeEntry.count(VL[0])) {
>> int Idx = ScalarToTreeEntry[VL[0]];
>> TreeEntry *E = &VectorizableTree[Idx];
>> - if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE)))
>> - return vectorizeTree(VL, E);
>> + if (E->isSame(VL) ||
>> + (!E->ShuffleMask.empty() && E->isFoundJumbled(VL, *DL, *SE)))
>> + return vectorizeTree(E);
>> }
>>
>> Type *ScalarTy = VL[0]->getType();
>> @@ -2370,10 +2374,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>> return Gather(VL, VecTy);
>> }
>>
>> -Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
>> +Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
>> IRBuilder<>::InsertPointGuard Guard(Builder);
>>
>> - if (E->VectorizedValue && !E->NeedToShuffle) {
>> + if (E->VectorizedValue && E->ShuffleMask.empty()) {
>> DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] <<
>> ".\n");
>> return E->VectorizedValue;
>> }
>> @@ -2611,27 +2615,18 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<V
>>
>> // As program order of scalar loads are jumbled, the vectorized 'load'
>> // must be followed by a 'shuffle' with the required jumbled mask.
>> - if (!VL.empty() && (E->NeedToShuffle)) {
>> - assert(VL.size() == E->Scalars.size() &&
>> - "Equal number of scalars expected");
>> + if (!E->ShuffleMask.empty()) {
>> SmallVector<Constant *, 8> Mask;
>> - for (Value *Val : VL) {
>> - if (ScalarToTreeEntry.count(Val)) {
>> - int Idx = ScalarToTreeEntry[Val];
>> - TreeEntry *E = &VectorizableTree[Idx];
>> - for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
>> - if (E->Scalars[Lane] == Val) {
>> - Mask.push_back(Builder.getInt32(Lane));
>> - break;
>> - }
>> - }
>> - }
>> + for (unsigned Lane = 0, LE = E->ShuffleMask.size(); Lane != LE;
>> + ++Lane) {
>> + Mask.push_back(Builder.getInt32(E->ShuffleMask[Lane]));
>> }
>> -
>> // Generate shuffle for jumbled memory access
>> Value *Undef = UndefValue::get(VecTy);
>> Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
>> ConstantVector::get(Mask));
>> + E->VectorizedValue = Shuf;
>> + ++NumVectorInstructions;
>> return Shuf;
>> }
>>
>> @@ -2816,7 +2811,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>> }
>>
>> Builder.SetInsertPoint(&F->getEntryBlock().front());
>> - auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]);
>> + auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
>>
>> // If the vectorized tree can be rewritten in a smaller type, we truncate the
>> // vectorized root. InstCombine will then rewrite the entire expression. We
>> @@ -2861,8 +2856,20 @@ BoUpSLP::vectorizeTree(ExtraValueToDebug
>>
>> Value *Vec = E->VectorizedValue;
>> assert(Vec && "Can't find vectorizable value");
>> -
>> - Value *Lane = Builder.getInt32(ExternalUse.Lane);
>> + unsigned i = 0;
>> + Value *Lane;
>> + // In case vectorizable scalars use are not in-order, scalars would have
>> + // been shuffled. Recompute the proper Lane of ExternalUse.
>> + if (!E->ShuffleMask.empty()) {
>> + SmallVector<unsigned, 4> Val(E->ShuffleMask.size());
>> + for (; i < E->ShuffleMask.size(); i++) {
>> + if (E->ShuffleMask[i] == (unsigned)ExternalUse.Lane)
>> + break;
>> + }
>> + Lane = Builder.getInt32(i);
>> + } else {
>> + Lane = Builder.getInt32(ExternalUse.Lane);
>> + }
>> // If User == nullptr, the Scalar is used as extra arg. Generate
>> // ExtractElement instruction and update the record for this scalar in
>> // ExternallyUsedValues.
>>
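>> The Lane fix-up in the hunk above is a linear search for the slot of
>> the shuffled vector that now holds the original scalar: the shuffle
>> puts the scalar from lane ShuffleMask[i] into slot i, so the extract
>> must use the i with ShuffleMask[i] == ExternalUse.Lane. A standalone
>> sketch of the same lookup (illustrative names, not LLVM code):
>>
>> #include <vector>
>>
>> // Return the post-shuffle slot holding original lane OrigLane;
>> // with no mask the order is unchanged and the lane passes through.
>> unsigned recomputeLane(const std::vector<unsigned> &ShuffleMask,
>>                        unsigned OrigLane) {
>>   for (unsigned i = 0; i < ShuffleMask.size(); ++i)
>>     if (ShuffleMask[i] == OrigLane)
>>       return i;
>>   return OrigLane; // ShuffleMask empty: scalars were not reordered
>> }
>>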
>> Added: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll?rev=296863&view=auto
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll (added)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-load-bug.ll Fri Mar 3 04:02:47 2017
>> @@ -0,0 +1,43 @@
>> +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
>> +; RUN: opt < %s -S -slp-vectorizer | FileCheck %s
>> +
>> +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
>> +target triple = "x86_64-unknown-linux-gnu"
>> +
>> +define <4 x i32> @zot() #0 {
>> +; CHECK-LABEL: @zot(
>> +; CHECK-NEXT: bb:
>> +; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0
>> +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1
>> +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2
>> +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3
>> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P0]] to <4 x i8>*
>> +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
>> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
>> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
>> +; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x i8> undef, i8 [[TMP3]], i32 0
>> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
>> +; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x i8> [[I0]], i8 [[TMP4]], i32 1
>> +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
>> +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x i8> [[I1]], i8 [[TMP5]], i32 2
>> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
>> +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x i8> [[I2]], i8 [[TMP6]], i32 3
>> +; CHECK-NEXT: [[RET:%.*]] = zext <4 x i8> [[I3]] to <4 x i32>
>> +; CHECK-NEXT: ret <4 x i32> [[RET]]
>> +;
>> +bb:
>> + %p0 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 0
>> + %p1 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 1
>> + %p2 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 2
>> + %p3 = getelementptr inbounds [4 x i8], [4 x i8]* undef, i64 undef, i64 3
>> + %v3 = load i8, i8* %p3, align 1
>> + %v2 = load i8, i8* %p2, align 1
>> + %v0 = load i8, i8* %p0, align 1
>> + %v1 = load i8, i8* %p1, align 1
>> + %i0 = insertelement <4 x i8> undef, i8 %v1, i32 0
>> + %i1 = insertelement <4 x i8> %i0, i8 %v0, i32 1
>> + %i2 = insertelement <4 x i8> %i1, i8 %v2, i32 2
>> + %i3 = insertelement <4 x i8> %i2, i8 %v3, i32 3
>> + %ret = zext <4 x i8> %i3 to <4 x i32>
>> + ret <4 x i32> %ret
>> +}
>>
>> Modified: llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll?rev=296863&r1=296862&r2=296863&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll (original)
>> +++ llvm/trunk/test/Transforms/SLPVectorizer/X86/jumbled-same.ll Fri Mar 3 04:02:47 2017
>> @@ -13,7 +13,7 @@ define i32 @fn1() {
>> ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([4 x i32]* @b to <4 x i32>*), align 4
>> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
>> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
>> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
>> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
>> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[TMP3]], i32 0
>> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 1
>> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 ptrtoint (i32 ()* @fn1 to i32), i32 2
>>
>>