[llvm] r184684 - LoopVectorize: Add utility class for checking dependency among accesses
Hal Finkel
hfinkel at anl.gov
Thu Oct 31 07:40:10 PDT 2013
----- Original Message -----
> Author: arnolds
> Date: Sun Jun 23 22:55:45 2013
> New Revision: 184684
>
> URL: http://llvm.org/viewvc/llvm-project?rev=184684&view=rev
> Log:
> LoopVectorize: Add utility class for checking dependency among accesses
>
> This class checks dependences by subtracting two Scalar Evolution access
> functions, allowing us to catch very simple linear dependences.
>
> The checker assumes source order in determining whether vectorization is
> safe. We currently don't reorder accesses.
> Positive true dependencies need to be a multiple of VF, otherwise we impede
> store-load forwarding.
>
> Modified:
> llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
>
> Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=184684&r1=184683&r2=184684&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
> +++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Sun Jun 23 22:55:45 2013
> @@ -3084,6 +3084,385 @@ void AccessAnalysis::processMemAccesses(
> }
> }
>
> +/// \brief Checks memory dependences among accesses to the same underlying
> +/// object to determine whether vectorization is legal or not (and at which
> +/// vectorization factor).
> +///
> +/// This class works under the assumption that we already checked that memory
> +/// locations with different underlying pointers are "must-not alias".
> +/// We use the ScalarEvolution framework to symbolically evaluate pairs of
> +/// access functions. Since we currently don't restructure the loop we can
> +/// rely on the program order of memory accesses to determine their safety.
> +/// At the moment we will only deem accesses as safe for:
> +///  * A negative constant distance assuming program order.
> +///
> +///      Safe: tmp = a[i + 1];      OR     a[i + 1] = x;
> +///            a[i] = tmp;                 y = a[i];
> +///
> +///    The latter case is safe because later checks guarantee that there
> +///    can't be a cycle through a phi node (that is, we check that "x" and
> +///    "y" are not the same variable: a header phi can only be an induction
> +///    or a reduction, a reduction can't have a memory sink, an induction
> +///    can't have a memory source). This is important and must not be
> +///    violated (or we have to resort to checking for cycles through memory).
> +///
> +///  * A positive constant distance assuming program order that is bigger
> +///    than the biggest memory access.
> +///
> +///      tmp = a[i]        OR       b[i] = x
> +///      a[i+2] = tmp               y = b[i+2];
> +///
> +///    Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
> +///
> +///  * Zero distances and all accesses have the same size.
> +///
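For concreteness, the three safe cases above correspond to source loops like
the following (my own sketches, assuming "a" is an int array of length n):

    // Negative distance: a[i + 1] is read before any later iteration
    // overwrites it.
    for (int i = 0; i < n - 1; ++i) {
      int tmp = a[i + 1];
      a[i] = tmp;
    }

    // Positive distance of two elements: safe for vector accesses of up to
    // 2 * sizeof(a[0]) bytes.
    for (int i = 0; i + 2 < n; ++i)
      a[i + 2] = a[i];

    // Zero distance with equal access sizes: the same element is read and
    // written within one iteration.
    for (int i = 0; i < n; ++i)
      a[i] = a[i] + 1;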
> +class MemoryDepChecker {
> +public:
> + typedef std::pair<Value*, char> MemAccessInfo;
> +
> +  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
> +      SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
> +
> +  /// \brief Register the location (instructions are given increasing
> +  /// numbers) of a write access.
> + void addAccess(StoreInst *SI) {
> + Value *Ptr = SI->getPointerOperand();
> + Accesses[std::make_pair(Ptr, true)].push_back(AccessIdx);
> + InstMap.push_back(SI);
> + ++AccessIdx;
> + }
> +
> +  /// \brief Register the location (instructions are given increasing
> +  /// numbers) of a read access.
> + void addAccess(LoadInst *LI) {
> + Value *Ptr = LI->getPointerOperand();
> + Accesses[std::make_pair(Ptr, false)].push_back(AccessIdx);
> + InstMap.push_back(LI);
> + ++AccessIdx;
> + }
> +
> +  /// \brief Check whether the dependencies between the accesses are safe.
> + ///
> + /// Only checks sets with elements in \p CheckDeps.
> + bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
> + DenseSet<MemAccessInfo> &CheckDeps);
> +
> +  /// \brief The maximum number of bytes of a vector register we can
> +  /// vectorize the accesses safely with.
> + unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
> +
> +private:
> + ScalarEvolution *SE;
> + DataLayout *DL;
> + const Loop *InnermostLoop;
> +
> +  /// \brief Maps access locations (ptr, read/write) to program order.
> + DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
> +
> + /// \brief Memory access instructions in program order.
> + SmallVector<Instruction *, 16> InstMap;
> +
> +  /// \brief The program order index to be used for the next instruction.
> + unsigned AccessIdx;
> +
> + // We can access this many bytes in parallel safely.
> + unsigned MaxSafeDepDistBytes;
> +
> +  /// \brief Check whether there is a plausible dependence between the two
> +  /// accesses.
> +  ///
> +  /// Access \p A must happen before \p B in program order. The two indices
> +  /// give the positions of the accesses in the program order map.
> +  ///
> +  /// This function checks whether there is a plausible dependence (or the
> +  /// absence of such can't be proved) between the two accesses. If there is
> +  /// a plausible dependence but the dependence distance is bigger than one
> +  /// element access it records this distance in \p MaxSafeDepDistBytes (if
> +  /// this distance is smaller than any other distance encountered so far).
> +  /// Otherwise, this function returns true signaling a possible dependence.
> + bool isDependent(const MemAccessInfo &A, unsigned AIdx,
> + const MemAccessInfo &B, unsigned BIdx);
> +
> +  /// \brief Check whether the data dependence could prevent store-load
> +  /// forwarding.
> +  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
> +};
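A minimal sketch of how a client might drive this class (my own illustration,
not part of the patch; the helper name checkDeps and the parameters merely
stand in for state owned by the surrounding legality analysis in this file):

    // Hypothetical driver: register all loads and stores in program order,
    // then ask the checker whether every candidate pair is safe.
    static bool checkDeps(ScalarEvolution *SE, DataLayout *DL,
                          const Loop *TheLoop,
                          AccessAnalysis::DepCandidates &AccessSets,
                          DenseSet<MemoryDepChecker::MemAccessInfo> &CheckDeps) {
      MemoryDepChecker DepChecker(SE, DL, TheLoop);
      for (Loop::block_iterator B = TheLoop->block_begin(),
           BE = TheLoop->block_end(); B != BE; ++B)
        for (BasicBlock::iterator I = (*B)->begin(), E = (*B)->end();
             I != E; ++I) {
          // addAccess hands out increasing program-order indices as it goes.
          if (LoadInst *LI = dyn_cast<LoadInst>(I))
            DepChecker.addAccess(LI);
          else if (StoreInst *SI = dyn_cast<StoreInst>(I))
            DepChecker.addAccess(SI);
        }
      // On success, getMaxSafeDepDistBytes() bounds the usable vector width.
      return DepChecker.areDepsSafe(AccessSets, CheckDeps);
    }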
> +
> +static bool isInBoundsGep(Value *Ptr) {
> + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
> + return GEP->isInBounds();
> + return false;
> +}
> +
> +/// \brief Check whether the access through \p Ptr has a constant stride.
> +static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
> +                        const Loop *Lp) {
> + const Type *PtrTy = Ptr->getType();
> + assert(PtrTy->isPointerTy() && "Unexpected non ptr");
> +
> + // Make sure that the pointer does not point to aggregate types.
> +  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) {
> +    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type"
> +          << *Ptr << "\n");
> +    return 0;
> +  }
> +
> + const SCEV *PtrScev = SE->getSCEV(Ptr);
> + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
> + if (!AR) {
> + DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
> + << *Ptr << " SCEV: " << *PtrScev << "\n");
> + return 0;
> + }
> +
> +  // The access function must stride over the innermost loop.
> +  if (Lp != AR->getLoop()) {
> +    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop "
> +          << *Ptr << " SCEV: " << *PtrScev << "\n");
> +    return 0;
> +  }
> +
> +  // The address calculation must not wrap. Otherwise, a dependence could be
> +  // inverted. An inbounds getelementptr that is an AddRec with a unit stride
> +  // cannot wrap by definition. The unit stride requirement is checked later.
> + bool IsInBoundsGEP = isInBoundsGep(Ptr);
> + bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
> + if (!IsNoWrapAddRec && !IsInBoundsGEP) {
> + DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the
> address space "
> + << *Ptr << " SCEV: " << *PtrScev << "\n");
> + return 0;
> + }
> +
> + // Check the step is constant.
> + const SCEV *Step = AR->getStepRecurrence(*SE);
> +
> + // Calculate the pointer stride and check if it is consecutive.
> + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
> + if (!C) {
> + DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " <<
> *Ptr <<
> + " SCEV: " << *PtrScev << "\n");
> + return 0;
> + }
> +
> +  int64_t Size = DL->getTypeAllocSize(PtrTy->getPointerElementType());
> + const APInt &APStepVal = C->getValue()->getValue();
> +
> + // Huge step value - give up.
> + if (APStepVal.getBitWidth() > 64)
> + return 0;
> +
> + int64_t StepVal = APStepVal.getSExtValue();
> +
> + // Strided access.
> + int64_t Stride = StepVal / Size;
> + int64_t Rem = StepVal % Size;
> + if (Rem)
> + return 0;
> +
> +  // If the SCEV could wrap but we have an inbounds gep with a unit stride
> +  // we know we can't "wrap around the address space".
> +  if (!IsNoWrapAddRec && IsInBoundsGEP && Stride != 1 && Stride != -1)
> + return 0;
> +
> + return Stride;
> +}
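To make the stride arithmetic concrete (my numbers, assuming a 64-bit target
where sizeof(double) == 8 and "a" is a double array):

    //   Access    StepVal   Size   Stride   Rem
    //   a[i]            8      8        1     0   (consecutive)
    //   a[2*i]         16      8        2     0   (strided)
    //   a[n-i]         -8      8       -1     0   (reverse, consecutive)
    //
    // A step that is not a multiple of the element size leaves Rem != 0 and
    // the function conservatively returns a stride of 0 (unknown).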
> +
> +bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
> +                                                    unsigned TypeByteSize) {
> +  // If loads occur at a distance that is not a multiple of a feasible vector
> +  // factor store-load forwarding does not take place.
> +  // Positive dependences might cause trouble because vectorizing them might
> +  // prevent store-load forwarding making vectorized code run a lot slower.
> +  //   a[i] = a[i-3] ^ a[i-8];
> +  // The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
> +  // hence on your typical architecture store-load forwarding does not take
> +  // place. Vectorizing in such cases does not make sense.
> +  // Store-load forwarding distance.
> +  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
> + // Maximum vector factor.
> +  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
> +  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
> +    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
> +
> +  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
> +       vf *= 2) {
> +    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
> +      MaxVFWithoutSLForwardIssues = (vf >>= 1);
> + break;
> + }
> + }
> +
> +  if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
> +    DEBUG(dbgs() << "LV: Distance " << Distance
> +          << " that could cause a store-load forwarding conflict\n");
> + return true;
> + }
> +
> + if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
> + MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
> + MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
> + return false;
> +}
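The VF search above can be reproduced in isolation. This is a sketch under my
own assumptions: MaxVectorWidth is the file-level constant (64 at this
revision, if I read it right), and I ignore the clamping against
MaxSafeDepDistBytes:

    #include <cstdio>

    // Largest power-of-two vector width (in bytes) that does not defeat
    // store-load forwarding for a given dependence distance.
    static unsigned maxVFBytes(unsigned Distance, unsigned TypeByteSize) {
      const unsigned MaxVectorWidth = 64;
      const unsigned NumCyclesForStoreLoadThroughMemory = 8 * TypeByteSize;
      unsigned MaxVF = MaxVectorWidth * TypeByteSize;
      for (unsigned vf = 2 * TypeByteSize; vf <= MaxVF; vf *= 2)
        if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory)
          return vf / 2; // fall back to the last width that still forwards
      return MaxVF;
    }

    int main() {
      // a[i] = a[i-3] on i32: 12 % 8 != 0, so the best width is 4 bytes,
      // below the 8-byte minimum -> treated as a forwarding conflict.
      printf("%u\n", maxVFBytes(12, 4));
      // a[i] = a[i-4] on i32: a distance of 16 bytes still allows 16-byte
      // vectors, i.e. four i32 lanes.
      printf("%u\n", maxVFBytes(16, 4));
      return 0;
    }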
> +
> +bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
> +                                   const MemAccessInfo &B, unsigned BIdx) {
> + assert (AIdx < BIdx && "Must pass arguments in program order");
> +
> + Value *APtr = A.first;
> + Value *BPtr = B.first;
> + bool AIsWrite = A.second;
> + bool BIsWrite = B.second;
> +
> + // Two reads are independent.
> + if (!AIsWrite && !BIsWrite)
> + return false;
> +
> + const SCEV *AScev = SE->getSCEV(APtr);
> + const SCEV *BScev = SE->getSCEV(BPtr);
> +
> + int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
> + int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
> +
> + const SCEV *Src = AScev;
> + const SCEV *Sink = BScev;
> +
> +  // If the induction step is negative we have to invert source and sink of
> +  // the dependence.
> + if (StrideAPtr < 0) {
> + //Src = BScev;
> + //Sink = AScev;
> + std::swap(APtr, BPtr);
> + std::swap(Src, Sink);
> + std::swap(AIsWrite, BIsWrite);
> + std::swap(AIdx, BIdx);
> + std::swap(StrideAPtr, StrideBPtr);
> + }
> +
> + const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
> +
> + DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
> + << "(Induction step: " << StrideAPtr << ")\n");
> + DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
> + << *InstMap[BIdx] << ": " << *Dist << "\n");
> +
> +  // Need consecutive accesses. We don't want to vectorize "A[B[i]] += ..."
> +  // and similar code or pointer arithmetic that could wrap in the address
> +  // space.
> +  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
> +    DEBUG(dbgs() << "Non-consecutive pointer access\n");
> + return true;
> + }
> +
> + const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
> + if (!C) {
> + DEBUG(dbgs() << "LV: Dependence because of non constant
> distance\n");
> + return true;
> + }
> +
> + Type *ATy = APtr->getType()->getPointerElementType();
> + Type *BTy = BPtr->getType()->getPointerElementType();
> + unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
> +
> + // Negative distances are not plausible dependencies.
> + const APInt &Val = C->getValue()->getValue();
> + if (Val.isNegative()) {
> + bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
> +    if (IsTrueDataDependence &&
> +        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
> +         ATy != BTy))
I don't understand why we're checking for store-load forwarding in this case. If I understand correctly, this handles cases where we have things like:
    for (int i = 0; i < n; ++i) {
      a[i] = a[i+3]; // + b[i];
    }
where the load is always of 'future' values (that have not yet been stored). In these cases, we're never loading stored values; is there some other reason to apply this check?
Thanks again,
Hal
> + return true;
> +
> + DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
> + return false;
> + }
> +
> + // Write to the same location with the same size.
> +  // Could be improved to assert type sizes are the same (i32 == float, etc).
> + if (Val == 0) {
> + if (ATy == BTy)
> + return false;
> + DEBUG(dbgs() << "LV: Zero dependence difference but different
> types");
> + return true;
> + }
> +
> + assert(Val.isStrictlyPositive() && "Expect a positive value");
> +
> + // Positive distance bigger than max vectorization factor.
> + if (ATy != BTy) {
> +    DEBUG(dbgs() <<
> +          "LV: ReadWrite-Write positive dependency with different types");
> + return false;
> + }
> +
> + unsigned Distance = (unsigned) Val.getZExtValue();
> +
> +  // Bail out early if passed-in parameters make vectorization not feasible.
> +  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
> +  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
> +
> +  // The distance must be bigger than the size needed for a vectorized
> +  // version of the operation and the size of the vectorized operation must
> +  // not be bigger than the current maximum size.
> + if (Distance < 2*TypeByteSize ||
> + 2*TypeByteSize > MaxSafeDepDistBytes ||
> + Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
> + DEBUG(dbgs() << "LV: Failure because of Positive distance "
> + << Val.getSExtValue() << "\n");
> + return true;
> + }
> +
> + MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
> + Distance : MaxSafeDepDistBytes;
> +
> + bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
> + if (IsTrueDataDependence &&
> + couldPreventStoreLoadForward(Distance, TypeByteSize))
> + return true;
> +
> + DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
> + " with max VF=" << MaxSafeDepDistBytes/TypeByteSize <<
> "\n");
> +
> + return false;
> +}
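Spelling out the positive-distance logic with numbers (my arithmetic; i32
accesses so TypeByteSize = 4, and no forced factor or unroll so both default
to 1):

    // a[i+1] = a[i];  Distance =  4 <  2*4  -> unsafe, return true
    // a[i+2] = a[i];  Distance =  8 >= 2*4  -> safe, MaxSafeDepDistBytes = 8,
    //                                          i.e. at most two i32 lanes
    // a[i+4] = a[i];  Distance = 16 >= 2*4  -> safe, up to 16/4 = 4 lanes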
> +
> +bool
> +MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
> +                              DenseSet<MemAccessInfo> &CheckDeps) {
> +
> + MaxSafeDepDistBytes = -1U;
> + while (!CheckDeps.empty()) {
> + MemAccessInfo CurAccess = *CheckDeps.begin();
> +
> + // Get the relevant memory access set.
> + EquivalenceClasses<MemAccessInfo>::iterator I =
> + AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
> +
> + // Check accesses within this set.
> + EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
> + AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
> +
> + // Check every access pair.
> + while (AI != AE) {
> + CheckDeps.erase(*AI);
> +      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
> + while (OI != AE) {
> + // Check every accessing instruction pair in program order.
> +        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
> +             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
> +          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
> +               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
> + if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
> + return false;
> + if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
> + return false;
> + }
> + ++OI;
> + }
> + AI++;
> + }
> + }
> + return true;
> +}
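As a worked example of the pairwise walk (my own illustration): for a loop
body containing

    a[i + 2] = a[i] + 1;  // load registered at index 0, store at index 1

the two pointers differ but share the underlying object "a", so the earlier
AccessAnalysis pass puts them into one dependence candidate set; areDepsSafe
then calls isDependent on (load pointer, 0) and (store pointer, 1), which
finds a constant distance of +2 elements and records 2 * sizeof(a[0]) in
MaxSafeDepDistBytes.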
> +
> AliasAnalysis::Location
> LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
> if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
>
--
Hal Finkel
Assistant Computational Scientist
Leadership Computing Facility
Argonne National Laboratory