[llvm] r184685 - LoopVectorize: Use the dependence test utility class
Nadav Rotem
nrotem at apple.com
Sun Jun 23 21:59:29 PDT 2013
Excellent! :)
On Jun 23, 2013, at 8:55 PM, Arnold Schwaighofer <aschwaighofer at apple.com> wrote:
> Author: arnolds
> Date: Sun Jun 23 22:55:48 2013
> New Revision: 184685
>
> URL: http://llvm.org/viewvc/llvm-project?rev=184685&view=rev
> Log:
> LoopVectorize: Use the dependence test utility class
>
> We now no longer need alias analysis - the cases that alias analysis would
> handle are now handled as accesses with a large dependence distance.
>
> We can now vectorize loops with simple constant dependence distances.
>
> for (i = 8; i < 256; ++i) {
> a[i] = a[i+4] * a[i+8];
> }
>
> for (i = 8; i < 256; ++i) {
> a[i] = a[i-4] * a[i-8];
> }
>
> We would be able to vectorize about 200 more loops (in many cases the cost model
> instructs us not to) in the test suite now. Results on x86-64 are a wash.
>
> I have seen one degradation in ammp. Interestingly, the function in which we
> now vectorize a loop is never executed so we probably see some instruction
> cache effects. There is a 2% improvement in h264ref. One or two of the TSVC
> loop kernels also speed up.
>
> radar://13681598
>
> Added:
> llvm/trunk/test/Transforms/LoopVectorize/memdep.ll
> Modified:
> llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> llvm/trunk/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
> llvm/trunk/test/Transforms/LoopVectorize/runtime-check.ll
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=184685&r1=184684&r2=184685&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Sun Jun 23 22:55:48 2013
> @@ -54,7 +54,6 @@
> #include "llvm/ADT/SmallVector.h"
> #include "llvm/ADT/StringExtras.h"
> #include "llvm/Analysis/AliasAnalysis.h"
> -#include "llvm/Analysis/AliasSetTracker.h"
> #include "llvm/Analysis/Dominators.h"
> #include "llvm/Analysis/LoopInfo.h"
> #include "llvm/Analysis/LoopIterator.h"
> @@ -409,11 +408,10 @@ bool LoadHoisting::canHoistAllLoads() {
> class LoopVectorizationLegality {
> public:
> LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
> - DominatorTree *DT, TargetTransformInfo* TTI,
> - AliasAnalysis *AA, TargetLibraryInfo *TLI)
> - : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
> + DominatorTree *DT, TargetLibraryInfo *TLI)
> + : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
> Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
> - LoadSpeculation(L, DT) {}
> + MaxSafeDepDistBytes(-1U), LoadSpeculation(L, DT) {}
>
> /// This enum represents the kinds of reductions that we support.
> enum ReductionKind {
> @@ -500,7 +498,8 @@ public:
> }
>
> /// Insert a pointer and calculate the start and end SCEVs.
> - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
> + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
> + unsigned DepSetId);
>
> /// This flag indicates if we need to add the runtime check.
> bool Need;
> @@ -512,6 +511,9 @@ public:
> SmallVector<const SCEV*, 2> Ends;
> /// Holds the information if this pointer is used for writing to memory.
> SmallVector<bool, 2> IsWritePtr;
> + /// Holds the id of the set of pointers that could be dependent because of a
> + /// shared underlying object.
> + SmallVector<unsigned, 2> DependencySetId;
> };
>
> /// A POD for saving information about induction variables.
> @@ -532,11 +534,6 @@ public:
> /// induction descriptor.
> typedef MapVector<PHINode*, InductionInfo> InductionList;
>
> - /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
> - /// respective Store/Load instruction(s) to calculate aliasing.
> - typedef MapVector<Value*, Instruction* > AliasMap;
> - typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
> -
> /// Returns true if it is legal to vectorize this loop.
> /// This does not mean that it is profitable to vectorize this
> /// loop, only that it is legal to do so.
> @@ -583,6 +580,9 @@ public:
> /// This function returns the identity element (or neutral element) for
> /// the operation K.
> static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
> +
> + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
> +
> private:
> /// Check if a single basic block loop is vectorizable.
> /// At this point we know that this is a loop with a constant trip count
> @@ -623,16 +623,6 @@ private:
> /// Returns the induction kind of Phi. This function may return NoInduction
> /// if the PHI is not an induction variable.
> InductionKind isInductionVariable(PHINode *Phi);
> - /// Return true if can compute the address bounds of Ptr within the loop.
> - bool hasComputableBounds(Value *Ptr);
> - /// Return true if there is the chance of write reorder.
> - bool hasPossibleGlobalWriteReorder(Value *Object,
> - Instruction *Inst,
> - AliasMultiMap &WriteObjects,
> - unsigned MaxByteWidth);
> - /// Return the AA location for a load or a store.
> - AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
> -
>
> /// The loop that we evaluate.
> Loop *TheLoop;
> @@ -642,10 +632,6 @@ private:
> DataLayout *DL;
> /// Dominators.
> DominatorTree *DT;
> - /// Target Info.
> - TargetTransformInfo *TTI;
> - /// Alias Analysis.
> - AliasAnalysis *AA;
> /// Target Library Info.
> TargetLibraryInfo *TLI;
>
> @@ -675,6 +661,8 @@ private:
> /// Can we assume the absence of NaNs.
> bool HasFunNoNaNAttr;
>
> + unsigned MaxSafeDepDistBytes;
> +
> /// Utility to determine whether loads can be speculated.
> LoadHoisting LoadSpeculation;
> };
> @@ -903,7 +891,6 @@ struct LoopVectorize : public LoopPass {
> LoopInfo *LI;
> TargetTransformInfo *TTI;
> DominatorTree *DT;
> - AliasAnalysis *AA;
> TargetLibraryInfo *TLI;
>
> virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
> @@ -916,7 +903,6 @@ struct LoopVectorize : public LoopPass {
> LI = &getAnalysis<LoopInfo>();
> TTI = &getAnalysis<TargetTransformInfo>();
> DT = &getAnalysis<DominatorTree>();
> - AA = getAnalysisIfAvailable<AliasAnalysis>();
> TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
>
> if (DL == NULL) {
> @@ -935,7 +921,7 @@ struct LoopVectorize : public LoopPass {
> }
>
> // Check if it is legal to vectorize the loop.
> - LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
> + LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
> if (!LVL.canVectorize()) {
> DEBUG(dbgs() << "LV: Not vectorizing.\n");
> return false;
> @@ -1010,7 +996,8 @@ struct LoopVectorize : public LoopPass {
> void
> LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
> Loop *Lp, Value *Ptr,
> - bool WritePtr) {
> + bool WritePtr,
> + unsigned DepSetId) {
> const SCEV *Sc = SE->getSCEV(Ptr);
> const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
> assert(AR && "Invalid addrec expression");
> @@ -1020,6 +1007,7 @@ LoopVectorizationLegality::RuntimePointe
> Starts.push_back(AR->getStart());
> Ends.push_back(ScEnd);
> IsWritePtr.push_back(WritePtr);
> + DependencySetId.push_back(DepSetId);
> }
>
> Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
> @@ -1357,10 +1345,9 @@ InnerLoopVectorizer::addRuntimeCheck(Loo
> if (!PtrRtCheck->Need)
> return NULL;
>
> - Instruction *MemoryRuntimeCheck = 0;
> unsigned NumPointers = PtrRtCheck->Pointers.size();
> - SmallVector<Value* , 2> Starts;
> - SmallVector<Value* , 2> Ends;
> + SmallVector<TrackingVH<Value> , 2> Starts;
> + SmallVector<TrackingVH<Value> , 2> Ends;
>
> SCEVExpander Exp(*SE, "induction");
>
> @@ -1387,13 +1374,18 @@ InnerLoopVectorizer::addRuntimeCheck(Loo
> }
>
> IRBuilder<> ChkBuilder(Loc);
> -
> + // Our instructions might fold to a constant.
> + Value *MemoryRuntimeCheck = 0;
> for (unsigned i = 0; i < NumPointers; ++i) {
> for (unsigned j = i+1; j < NumPointers; ++j) {
> // No need to check if two readonly pointers intersect.
> if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
> continue;
>
> + // Only need to check pointers between two different dependency sets.
> + if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
> + continue;
> +
> Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
> Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
> Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
> @@ -1405,12 +1397,18 @@ InnerLoopVectorizer::addRuntimeCheck(Loo
> if (MemoryRuntimeCheck)
> IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
> "conflict.rdx");
> -
> - MemoryRuntimeCheck = cast<Instruction>(IsConflict);
> + MemoryRuntimeCheck = IsConflict;
> }
> }
>
> - return MemoryRuntimeCheck;
> + // We have to do this trickery because the IRBuilder might fold the check to a
> + // constant expression in which case there is no Instruction anchored in a
> + // the block.
> + LLVMContext &Ctx = Loc->getContext();
> + Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
> + ConstantInt::getTrue(Ctx));
> + ChkBuilder.Insert(Check, "memcheck.conflict");
> + return Check;
> }
>
> void
> @@ -2981,7 +2979,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
> // Each access has its own dependence set.
> DepId = RunningDepId++;
>
> - //RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
> + RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
>
> DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
> } else {
> @@ -3463,53 +3461,29 @@ MemoryDepChecker::areDepsSafe(AccessAnal
> return true;
> }
>
> -AliasAnalysis::Location
> -LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
> - if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
> - return AA->getLocation(Store);
> - else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
> - return AA->getLocation(Load);
> -
> - llvm_unreachable("Should be either load or store instruction");
> -}
> -
> -bool
> -LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
> - Value *Object,
> - Instruction *Inst,
> - AliasMultiMap& WriteObjects,
> - unsigned MaxByteWidth) {
> -
> - AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
> -
> - std::vector<Instruction*>::iterator
> - it = WriteObjects[Object].begin(),
> - end = WriteObjects[Object].end();
> -
> - for (; it != end; ++it) {
> - Instruction* I = *it;
> - if (I == Inst)
> - continue;
> -
> - AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
> - if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
> - ThatLoc.getWithNewSize(MaxByteWidth)))
> - return true;
> - }
> - return false;
> -}
> -
> bool LoopVectorizationLegality::canVectorizeMemory() {
>
> typedef SmallVector<Value*, 16> ValueVector;
> typedef SmallPtrSet<Value*, 16> ValueSet;
> +
> + // Stores a pair of memory access location and whether the access is a store
> + // (true) or a load (false).
> + typedef std::pair<Value*, char> MemAccessInfo;
> + typedef DenseSet<MemAccessInfo> PtrAccessSet;
> +
> // Holds the Load and Store *instructions*.
> ValueVector Loads;
> ValueVector Stores;
> +
> + // Holds all the different accesses in the loop.
> + unsigned NumReads = 0;
> + unsigned NumReadWrites = 0;
> +
> PtrRtCheck.Pointers.clear();
> PtrRtCheck.Need = false;
>
> const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
> + MemoryDepChecker DepChecker(SE, DL, TheLoop);
>
> // For each block.
> for (Loop::block_iterator bb = TheLoop->block_begin(),
> @@ -3530,6 +3504,7 @@ bool LoopVectorizationLegality::canVecto
> return false;
> }
> Loads.push_back(Ld);
> + DepChecker.addAccess(Ld);
> continue;
> }
>
> @@ -3542,6 +3517,7 @@ bool LoopVectorizationLegality::canVecto
> return false;
> }
> Stores.push_back(St);
> + DepChecker.addAccess(St);
> }
> } // next instr.
> } // next block.
> @@ -3556,10 +3532,8 @@ bool LoopVectorizationLegality::canVecto
> return true;
> }
>
> - // Holds the read and read-write *pointers* that we find. These maps hold
> - // unique values for pointers (so no need for multi-map).
> - AliasMap Reads;
> - AliasMap ReadWrites;
> + AccessAnalysis::DepCandidates DependentAccesses;
> + AccessAnalysis Accesses(DL, DependentAccesses);
>
> // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
> // multiple times on the same object. If the ptr is accessed twice, once
> @@ -3578,10 +3552,12 @@ bool LoopVectorizationLegality::canVecto
> return false;
> }
>
> - // If we did *not* see this pointer before, insert it to
> - // the read-write list. At this phase it is only a 'write' list.
> - if (Seen.insert(Ptr))
> - ReadWrites.insert(std::make_pair(Ptr, ST));
> + // If we did *not* see this pointer before, insert it to the read-write
> + // list. At this phase it is only a 'write' list.
> + if (Seen.insert(Ptr)) {
> + ++NumReadWrites;
> + Accesses.addStore(Ptr);
> + }
> }
>
> if (IsAnnotatedParallel) {
> @@ -3591,6 +3567,7 @@ bool LoopVectorizationLegality::canVecto
> return true;
> }
>
> + SmallPtrSet<Value *, 16> ReadOnlyPtr;
> for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
> LoadInst *LD = cast<LoadInst>(*I);
> Value* Ptr = LD->getPointerOperand();
> @@ -3602,51 +3579,44 @@ bool LoopVectorizationLegality::canVecto
> // If the address of i is unknown (for example A[B[i]]) then we may
> // read a few words, modify, and write a few words, and some of the
> // words may be written to the same address.
> - if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
> - Reads.insert(std::make_pair(Ptr, LD));
> + bool IsReadOnlyPtr = false;
> + if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
> + ++NumReads;
> + IsReadOnlyPtr = true;
> + }
> + Accesses.addLoad(Ptr, IsReadOnlyPtr);
> }
>
> // If we write (or read-write) to a single destination and there are no
> // other reads in this loop then is it safe to vectorize.
> - if (ReadWrites.size() == 1 && Reads.size() == 0) {
> + if (NumReadWrites == 1 && NumReads == 0) {
> DEBUG(dbgs() << "LV: Found a write-only loop!\n");
> return true;
> }
>
> - unsigned NumReadPtrs = 0;
> - unsigned NumWritePtrs = 0;
> + // Build dependence sets and check whether we need a runtime pointer bounds
> + // check.
> + Accesses.buildDependenceSets();
> + bool NeedRTCheck = Accesses.isRTCheckNeeded();
>
> // Find pointers with computable bounds. We are going to use this information
> // to place a runtime bound check.
> - bool CanDoRT = true;
> - AliasMap::iterator MI, ME;
> - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
> - Value *V = (*MI).first;
> - if (hasComputableBounds(V)) {
> - PtrRtCheck.insert(SE, TheLoop, V, true);
> - NumWritePtrs++;
> - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
> - } else {
> - CanDoRT = false;
> - break;
> - }
> - }
> - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
> - Value *V = (*MI).first;
> - if (hasComputableBounds(V)) {
> - PtrRtCheck.insert(SE, TheLoop, V, false);
> - NumReadPtrs++;
> - DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
> - } else {
> - CanDoRT = false;
> - break;
> - }
> - }
> + unsigned NumComparisons = 0;
> + bool CanDoRT = false;
> + if (NeedRTCheck)
> + CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);
> +
> +
> + DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
> + " pointer comparisons.\n");
> +
> + // If we only have one set of dependences to check pointers among we don't
> + // need a runtime check.
> + if (NumComparisons == 0 && NeedRTCheck)
> + NeedRTCheck = false;
>
> - // Check that we did not collect too many pointers or found a
> - // unsizeable pointer.
> - unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
> - DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
> + // Check that we did not collect too many pointers or found a unsizeable
> + // pointer.
> if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
> PtrRtCheck.reset();
> CanDoRT = false;
> @@ -3656,113 +3626,6 @@ bool LoopVectorizationLegality::canVecto
> DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
> }
>
> - bool NeedRTCheck = false;
> -
> - // Biggest vectorized access possible, vector width * unroll factor.
> - // TODO: We're being very pessimistic here, find a way to know the
> - // real access width before getting here.
> - unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
> - TTI->getMaximumUnrollFactor();
> - // Now that the pointers are in two lists (Reads and ReadWrites), we
> - // can check that there are no conflicts between each of the writes and
> - // between the writes to the reads.
> - // Note that WriteObjects duplicates the stores (indexed now by underlying
> - // objects) to avoid pointing to elements inside ReadWrites.
> - // TODO: Maybe create a new type where they can interact without duplication.
> - AliasMultiMap WriteObjects;
> - ValueVector TempObjects;
> -
> - // Check that the read-writes do not conflict with other read-write
> - // pointers.
> - bool AllWritesIdentified = true;
> - for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
> - Value *Val = (*MI).first;
> - Instruction *Inst = (*MI).second;
> -
> - GetUnderlyingObjects(Val, TempObjects, DL);
> - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
> - UI != UE; ++UI) {
> - if (!isIdentifiedObject(*UI)) {
> - DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
> - NeedRTCheck = true;
> - AllWritesIdentified = false;
> - }
> -
> - // Never seen it before, can't alias.
> - if (WriteObjects[*UI].empty()) {
> - DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
> - WriteObjects[*UI].push_back(Inst);
> - continue;
> - }
> - // Direct alias found.
> - if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
> - DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
> - << **UI <<"\n");
> - return false;
> - }
> - DEBUG(dbgs() << "LV: Found a conflicting global value:"
> - << **UI <<"\n");
> - DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
> - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
> -
> - // If global alias, make sure they do alias.
> - if (hasPossibleGlobalWriteReorder(*UI,
> - Inst,
> - WriteObjects,
> - MaxByteWidth)) {
> - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
> - << "\n");
> - return false;
> - }
> -
> - // Didn't alias, insert into map for further reference.
> - WriteObjects[*UI].push_back(Inst);
> - }
> - TempObjects.clear();
> - }
> -
> - /// Check that the reads don't conflict with the read-writes.
> - for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
> - Value *Val = (*MI).first;
> - GetUnderlyingObjects(Val, TempObjects, DL);
> - for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
> - UI != UE; ++UI) {
> - // If all of the writes are identified then we don't care if the read
> - // pointer is identified or not.
> - if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
> - DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
> - NeedRTCheck = true;
> - }
> -
> - // Never seen it before, can't alias.
> - if (WriteObjects[*UI].empty())
> - continue;
> - // Direct alias found.
> - if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
> - DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
> - << **UI <<"\n");
> - return false;
> - }
> - DEBUG(dbgs() << "LV: Found a global value: "
> - << **UI <<"\n");
> - Instruction *Inst = (*MI).second;
> - DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
> - DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
> -
> - // If global alias, make sure they do alias.
> - if (hasPossibleGlobalWriteReorder(*UI,
> - Inst,
> - WriteObjects,
> - MaxByteWidth)) {
> - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
> - << "\n");
> - return false;
> - }
> - }
> - TempObjects.clear();
> - }
> -
> - PtrRtCheck.Need = NeedRTCheck;
> if (NeedRTCheck && !CanDoRT) {
> DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
> "the array bounds.\n");
> @@ -3770,9 +3633,20 @@ bool LoopVectorizationLegality::canVecto
> return false;
> }
>
> + PtrRtCheck.Need = NeedRTCheck;
> +
> + bool CanVecMem = true;
> + if (Accesses.isDependencyCheckNeeded()) {
> + DEBUG(dbgs() << "LV: Checking memory dependencies\n");
> + CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
> + Accesses.getDependenciesToCheck());
> + MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
> + }
> +
> DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
> " need a runtime memory check.\n");
> - return true;
> +
> + return CanVecMem;
> }
>
> static bool hasMultipleUsesOf(Instruction *I,
> @@ -4125,15 +3999,6 @@ bool LoopVectorizationLegality::blockCan
> return true;
> }
>
> -bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
> - const SCEV *PhiScev = SE->getSCEV(Ptr);
> - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
> - if (!AR)
> - return false;
> -
> - return AR->isAffine();
> -}
> -
> LoopVectorizationCostModel::VectorizationFactor
> LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
> unsigned UserVF) {
> @@ -4150,6 +4015,10 @@ LoopVectorizationCostModel::selectVector
>
> unsigned WidestType = getWidestType();
> unsigned WidestRegister = TTI.getRegisterBitWidth(true);
> + unsigned MaxSafeDepDist = -1U;
> + if (Legal->getMaxSafeDepDistBytes() != -1U)
> + MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
> + WidestRegister = WidestRegister < MaxSafeDepDist ? WidestRegister : MaxSafeDepDist;
> unsigned MaxVectorSize = WidestRegister / WidestType;
> DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
> DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
> @@ -4283,6 +4152,10 @@ LoopVectorizationCostModel::selectUnroll
> if (OptForSize)
> return 1;
>
> + // We used the distance for the unroll factor.
> + if (Legal->getMaxSafeDepDistBytes() != -1U)
> + return 1;
> +
> // Do not unroll loops with a relatively small trip count.
> unsigned TC = SE->getSmallConstantTripCount(TheLoop,
> TheLoop->getLoopLatch());
> @@ -4679,7 +4552,6 @@ Type* LoopVectorizationCostModel::ToVect
> char LoopVectorize::ID = 0;
> static const char lv_name[] = "Loop Vectorization";
> INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
> -INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
> INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
> INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
> INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
>
> Modified: llvm/trunk/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/12-12-11-if-conv.ll?rev=184685&r1=184684&r2=184685&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/12-12-11-if-conv.ll (original)
> +++ llvm/trunk/test/Transforms/LoopVectorize/12-12-11-if-conv.ll Sun Jun 23 22:55:48 2013
> @@ -30,7 +30,7 @@ if.then:
> if.end: ; preds = %for.body, %if.then
> %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
> store i32 %z.0, i32* %arrayidx, align 4
> - %indvars.iv.next = add i64 %indvars.iv, 1
> + %indvars.iv.next = add nsw i64 %indvars.iv, 1
> %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> %exitcond = icmp eq i32 %lftr.wideiv, %x
> br i1 %exitcond, label %for.end, label %for.body
>
> Added: llvm/trunk/test/Transforms/LoopVectorize/memdep.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/memdep.ll?rev=184685&view=auto
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/memdep.ll (added)
> +++ llvm/trunk/test/Transforms/LoopVectorize/memdep.ll Sun Jun 23 22:55:48 2013
> @@ -0,0 +1,222 @@
> +; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s
> +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH
> +
> +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
> +
> +; Vectorization with dependence checks.
> +
> +; No plausible dependence - can be vectorized.
> +; for (i = 0; i < 1024; ++i)
> +; A[i] = A[i + 1] + 1;
> +
> +; CHECK: f1_vec
> +; CHECK: <2 x i32>
> +
> +define void @f1_vec(i32* %A) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> + %indvars.iv.next = add i32 %indvars.iv, 1
> + %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv.next
> + %0 = load i32* %arrayidx, align 4
> + %add1 = add nsw i32 %0, 1
> + %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv
> + store i32 %add1, i32* %arrayidx3, align 4
> + %exitcond = icmp ne i32 %indvars.iv.next, 1024
> + br i1 %exitcond, label %for.body, label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +; Plausible dependence of distance 1 - can't be vectorized.
> +; for (i = 0; i < 1024; ++i)
> +; A[i+1] = A[i] + 1;
> +
> +; CHECK: f2_novec
> +; CHECK-NOT: <2 x i32>
> +
> +define void @f2_novec(i32* %A) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> + %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv
> + %0 = load i32* %arrayidx, align 4
> + %add = add nsw i32 %0, 1
> + %indvars.iv.next = add i32 %indvars.iv, 1
> + %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv.next
> + store i32 %add, i32* %arrayidx3, align 4
> + %exitcond = icmp ne i32 %indvars.iv.next, 1024
> + br i1 %exitcond, label %for.body, label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +; Plausible dependence of distance 2 - can be vectorized with a width of 2.
> +; for (i = 0; i < 1024; ++i)
> +; A[i+2] = A[i] + 1;
> +
> +; CHECK: f3_vec_len
> +; CHECK: <2 x i32>
> +
> +; WIDTH: f3_vec_len
> +; WIDTH-NOT: <4 x i32>
> +
> +define void @f3_vec_len(i32* %A) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
> + %idxprom = sext i32 %i.01 to i64
> + %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
> + %0 = load i32* %arrayidx, align 4
> + %add = add nsw i32 %0, 1
> + %add1 = add nsw i32 %i.01, 2
> + %idxprom2 = sext i32 %add1 to i64
> + %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2
> + store i32 %add, i32* %arrayidx3, align 4
> + %inc = add nsw i32 %i.01, 1
> + %cmp = icmp slt i32 %inc, 1024
> + br i1 %cmp, label %for.body, label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +; Plausible dependence of distance 1 - cannot be vectorized (without reordering
> +; accesses).
> +; for (i = 0; i < 1024; ++i) {
> +; B[i] = A[i];
> +; A[i] = B[i + 1];
> +; }
> +
> +; CHECK: f5
> +; CHECK-NOT: <2 x i32>
> +
> +define void @f5(i32* %A, i32* %B) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
> + %0 = load i32* %arrayidx, align 4
> + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
> + store i32 %0, i32* %arrayidx2, align 4
> + %indvars.iv.next = add nsw i64 %indvars.iv, 1
> + %arrayidx4 = getelementptr inbounds i32* %B, i64 %indvars.iv.next
> + %1 = load i32* %arrayidx4, align 4
> + store i32 %1, i32* %arrayidx, align 4
> + %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> + %exitcond = icmp ne i32 %lftr.wideiv, 1024
> + br i1 %exitcond, label %for.body, label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +; Dependence through a phi node - must not vectorize.
> +; for (i = 0; i < 1024; ++i) {
> +; a[i+1] = tmp;
> +; tmp = a[i];
> +; }
> +
> +; CHECK: f6
> +; CHECK-NOT: <2 x i32>
> +
> +define i32 @f6(i32* %a, i32 %tmp) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
> + %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
> + %indvars.iv.next = add nsw i64 %indvars.iv, 1
> + %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv.next
> + store i32 %tmp.addr.08, i32* %arrayidx, align 4
> + %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv
> + %0 = load i32* %arrayidx3, align 4
> + %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> + %exitcond = icmp ne i32 %lftr.wideiv, 1024
> + br i1 %exitcond, label %for.body, label %for.end
> +
> +for.end:
> + ret i32 undef
> +}
> +
> +; Don't vectorize true loop carried dependencies that are not a multiple of the
> +; vector width.
> +; Example:
> +; for (int i = ...; ++i) {
> +; a[i] = a[i-3] + ...;
> +; It is a bad idea to vectorize this loop because store-load forwarding will not
> +; happen.
> +;
> +
> +; CHECK: @nostoreloadforward
> +; CHECK-NOT: <2 x i32>
> +
> +define void @nostoreloadforward(i32* %A) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
> + %0 = add nsw i64 %indvars.iv, -3
> + %arrayidx = getelementptr inbounds i32* %A, i64 %0
> + %1 = load i32* %arrayidx, align 4
> + %2 = add nsw i64 %indvars.iv, 4
> + %arrayidx2 = getelementptr inbounds i32* %A, i64 %2
> + %3 = load i32* %arrayidx2, align 4
> + %add3 = add nsw i32 %3, %1
> + %arrayidx5 = getelementptr inbounds i32* %A, i64 %indvars.iv
> + store i32 %add3, i32* %arrayidx5, align 4
> + %indvars.iv.next = add i64 %indvars.iv, 1
> + %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> + %exitcond = icmp ne i32 %lftr.wideiv, 128
> + br i1 %exitcond, label %for.body, label %for.end
> +
> +for.end:
> + ret void
> +}
> +
> +; Example:
> +; for (int i = ...; ++i) {
> +; a[i] = b[i];
> +; c[i] = a[i-3] + ...;
> +; It is a bad idea to vectorize this loop because store-load forwarding will not
> +; happen.
> +;
> +
> +; CHECK: @nostoreloadforward2
> +; CHECK-NOT: <2 x i32>
> +
> +define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
> +entry:
> + br label %for.body
> +
> +for.body:
> + %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
> + %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
> + %0 = load i32* %arrayidx, align 4
> + %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv
> + store i32 %0, i32* %arrayidx2, align 4
> + %1 = add nsw i64 %indvars.iv, -3
> + %arrayidx4 = getelementptr inbounds i32* %A, i64 %1
> + %2 = load i32* %arrayidx4, align 4
> + %arrayidx6 = getelementptr inbounds i32* %C, i64 %indvars.iv
> + store i32 %2, i32* %arrayidx6, align 4
> + %indvars.iv.next = add i64 %indvars.iv, 1
> + %lftr.wideiv = trunc i64 %indvars.iv.next to i32
> + %exitcond = icmp ne i32 %lftr.wideiv, 128
> + br i1 %exitcond, label %for.body, label %for.end
> +
> +for.end:
> + ret void
> +}
>
> Modified: llvm/trunk/test/Transforms/LoopVectorize/runtime-check.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/runtime-check.ll?rev=184685&r1=184684&r2=184685&view=diff
> ==============================================================================
> --- llvm/trunk/test/Transforms/LoopVectorize/runtime-check.ll (original)
> +++ llvm/trunk/test/Transforms/LoopVectorize/runtime-check.ll Sun Jun 23 22:55:48 2013
> @@ -12,7 +12,7 @@ target triple = "x86_64-apple-macosx10.9
> ;CHECK: for.body.preheader:
> ;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck
> ;CHECK: vector.memcheck:
> -;CHECK: br i1 %found.conflict, label %middle.block, label %vector.ph
> +;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph
> ;CHECK: load <4 x float>
> define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
> entry:
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130623/44bcbf8f/attachment.html>
More information about the llvm-commits
mailing list