[llvm] [LoopVectorize] Add support for reverse loops in isDereferenceableAndAlignedInLoop (PR #96752)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 02:14:59 PST 2024


================
@@ -276,84 +277,85 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
-  auto &DL = LI->getDataLayout();
-  Value *Ptr = LI->getPointerOperand();
-
-  APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
-                DL.getTypeStoreSize(LI->getType()).getFixedValue());
-  const Align Alignment = LI->getAlign();
+  const SCEV *Ptr = SE.getSCEV(LI->getPointerOperand());
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(Ptr);
 
-  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
-
-  // If given a uniform (i.e. non-varying) address, see if we can prove the
-  // access is safe within the loop w/o needing predication.
-  if (L->isLoopInvariant(Ptr))
-    return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
-                                              HeaderFirstNonPHI, AC, &DT);
-
-  // Otherwise, check to see if we have a repeating access pattern where we can
-  // prove that all accesses are well aligned and dereferenceable.
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  // Check to see if we have a repeating access pattern and it's possible
+  // to prove all accesses are well aligned.
   if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
     return false;
+
   auto* Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
   if (!Step)
     return false;
 
-  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
-  if (!TC)
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  const Align Alignment = LI->getAlign();
+  auto &DL = LI->getDataLayout();
+  APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
+                DL.getTypeStoreSize(LI->getType()).getFixedValue());
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;
 
   // TODO: Handle overlapping accesses.
-  // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  if (EltSize.ugt(Step->getAPInt().abs()))
     return false;
 
-  // Compute the total access size for access patterns with unit stride and
-  // patterns with gaps. For patterns with unit stride, Step and EltSize are the
-  // same.
-  // For patterns with gaps (i.e. non unit stride), we are
-  // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  const SCEV *MaxBECount =
+      SE.getPredicatedSymbolicMaxBackedgeTakenCount(L, *Predicates);
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    return false;
 
-  assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
-         "implied by addrec definition");
-  Value *Base = nullptr;
-  if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
-    Base = StartS->getValue();
-  } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
+  const auto &[AccessStart, AccessEnd] =
+      getStartAndEndForAccess(L, Ptr, LI->getType(), MaxBECount, &SE, nullptr);
+  if (isa<SCEVCouldNotCompute>(AccessStart) ||
+      isa<SCEVCouldNotCompute>(AccessEnd))
+    return false;
 
-      // For the moment, restrict ourselves to the case where the offset is a
-      // multiple of the requested alignment and the base is aligned.
-      // TODO: generalize if a case found which warrants
-      if (Offset->getAPInt().urem(Alignment.value()) != 0)
-        return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
+  // Try to get the access size.
+  const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
+  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);
 
-  if (!Base)
+  // If the (max) pointer difference is > 32 bits then it's unlikely to be
+  // dereferenceable.
+  if (MaxPtrDiff.getActiveBits() > 32)
----------------
david-arm wrote:

Again, I added this code because reusing `getStartAndEndForAccess` requires passing in a SCEV value for the backedge-taken count, since the original version works with one. That means I can no longer call `getSmallConstantMaxTripCount`, which returns an unsigned value and, importantly, returns 0 whenever the max count doesn't fit in 32 bits. In order to maintain the same behaviour I added this explicit check. I'm happy to remove it, though, if you feel comfortable with removing that restriction?
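
For reference, a minimal sketch (not part of the patch) of the two behaviours being compared; it assumes the same `SE`, `L`, `Predicates`, `AccessStart` and `AccessEnd` values that are in scope in `isDereferenceableAndAlignedInLoop`:

```cpp
// Old behaviour: getSmallConstantMaxTripCount returns an unsigned and
// yields 0 whenever the maximum trip count does not fit in 32 bits, so
// the caller bailed out implicitly on very large counts.
unsigned TC = SE.getSmallConstantMaxTripCount(L, Predicates);
if (!TC)
  return false; // covers both "unknown" and "doesn't fit in 32 bits"

// New behaviour: the symbolic backedge-taken count is a SCEV, so the
// 32-bit restriction has to be re-imposed explicitly on the computed
// pointer range.
const SCEV *MaxBECount =
    SE.getPredicatedSymbolicMaxBackedgeTakenCount(L, *Predicates);
if (isa<SCEVCouldNotCompute>(MaxBECount))
  return false;
APInt MaxPtrDiff =
    SE.getUnsignedRangeMax(SE.getMinusSCEV(AccessEnd, AccessStart));
if (MaxPtrDiff.getActiveBits() > 32)
  return false; // mirrors the old implicit bail-out
```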

https://github.com/llvm/llvm-project/pull/96752

