[llvm] [LAA] Allow vectorizing `A[NonZeroNonConstantStride*I] += 1` (PR #186262)

Andrei Elovikov via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 13 10:50:24 PDT 2026


https://github.com/eas updated https://github.com/llvm/llvm-project/pull/186262

>From 942f8de8e68af75552e4af9d6892fa430044f58f Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Wed, 11 Mar 2026 12:45:41 -0700
Subject: [PATCH 1/3] [NFC][LAA] Add a test for a single strided-read-write
 access

---
 .../single_strided_readwrite.ll               | 243 ++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
new file mode 100644
index 0000000000000..390e694c0b340
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes='print<access-info>' -disable-output < %s -enable-mem-access-versioning=false 2>&1 | FileCheck %s
+
+define void @known_safe(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_safe'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %a.zext = zext i8 %a to i64
+  %stride = add i64 %a.zext, 1
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @known_safe_byte_gep(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_safe_byte_gep'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %a.zext = zext i8 %a to i64
+  %stride.elts = add i64 %a.zext, 1
+  %stride = mul i64 %stride.elts, 8
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `%a u> 0` RT check.
+define void @known_non_negative(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_non_negative'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %stride = zext i8 %a to i64
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `%a u> 0` RT check.
+define void @known_non_negative_scaled_for_byte_gep(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_non_negative_scaled_for_byte_gep'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %a.zext = zext i8 %a to i64
+  %stride = mul nsw nuw i64 %a.zext, 8
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `%a u> 8` RT check.
+define void @known_non_negative_nonscaled_for_byte_gep(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_non_negative_nonscaled_for_byte_gep'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %stride = zext i8 %a to i64
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `abs(%stride) u> 8` RT check.
+define void @arbitrary(ptr %p, i64 %stride) {
+; CHECK-LABEL: 'arbitrary'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}

>From 44adee81c74057b700f8890c4f5cd81c8c742881 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Thu, 12 Mar 2026 16:11:23 -0700
Subject: [PATCH 2/3] [NFCI][LAA] Add `getPtrStrideScev` function

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 95 ++++++++++++++----------
 1 file changed, 57 insertions(+), 38 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index d2f78d2d14fc8..1b2df207f211d 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -977,9 +977,9 @@ class AccessAnalysis {
 
 } // end anonymous namespace
 
-/// Try to compute a constant stride for \p AR. Used by getPtrStride and
-/// isNoWrap.
-static std::optional<int64_t>
+/// Try to compute a loop invariant stride for \p AR. Used by getPtrStrideScev
+/// and isNoWrap.
+static std::optional<const SCEV *>
 getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
                     Value *Ptr, PredicatedScalarEvolution &PSE) {
   if (isa<ScalableVectorType>(AccessTy)) {
@@ -988,44 +988,47 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
     return std::nullopt;
   }
 
-  // The access function must stride over the innermost loop.
-  if (Lp != AR->getLoop()) {
+  auto BadStride = [&](auto Str) {
     LLVM_DEBUG({
-      dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
+      dbgs() << "LAA: Bad stride - " << Str << " ";
       if (Ptr)
         dbgs() << *Ptr << " ";
 
       dbgs() << "SCEV: " << *AR << "\n";
     });
     return std::nullopt;
-  }
+  };
 
-  // Check the step is constant.
-  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
+  // The access function must stride over the innermost loop.
+  if (Lp != AR->getLoop())
+    return BadStride("Not striding over innermost loop");
 
-  // Calculate the pointer stride and check if it is constant.
-  const APInt *APStepVal;
-  if (!match(Step, m_scev_APInt(APStepVal))) {
-    LLVM_DEBUG({
-      dbgs() << "LAA: Bad stride - Not a constant strided ";
-      if (Ptr)
-        dbgs() << *Ptr << " ";
-      dbgs() << "SCEV: " << *AR << "\n";
-    });
+  // Check the step is loop invariant.
+  if (!AR->isAffine())
     return std::nullopt;
-  }
 
-  const auto &DL = Lp->getHeader()->getDataLayout();
-  TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
-  int64_t Size = AllocSize.getFixedValue();
+  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
 
-  // Huge step value - give up.
-  std::optional<int64_t> StepVal = APStepVal->trySExtValue();
-  if (!StepVal)
-    return std::nullopt;
+  auto *SE = PSE.getSE();
+  const SCEV *AbsStep = SE->getAbsExpr(Step, false);
+
+  const SCEV *TypeSizeScev = SE->getSizeOfExpr(
+      Step->getType(), SE->getDataLayout().getTypeAllocSize(AccessTy));
+
+  if (!SE->getURemExpr(AbsStep, TypeSizeScev)->isZero())
+    return BadStride("Not a multiple of access size");
+
+  // There is no ScalarEvolution::getSDiv, emulate that via AbsStep/TypeSize
+  // if the Step sign is known statically.
+  if (!(SE->isKnownNonPositive(Step) || SE->isKnownNonNegative(Step)))
+    return BadStride("Unknown sign");
+
+  const SCEV *AbsStepInElements = SE->getUDivExpr(AbsStep, TypeSizeScev);
+  const SCEV *StepInElements = SE->isKnownNonNegative(Step)
+                                   ? AbsStepInElements
+                                   : SE->getNegativeSCEV(AbsStepInElements);
 
-  // Strided access.
-  return *StepVal % Size ? std::nullopt : std::make_optional(*StepVal / Size);
+  return StepInElements;
 }
 
 /// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
@@ -1033,7 +1036,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
 static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
                      Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
                      const DominatorTree &DT,
-                     std::optional<int64_t> Stride = std::nullopt) {
+                     std::optional<const SCEV *> Stride = std::nullopt) {
   // FIXME: This should probably only return true for NUW.
   if (AR->getNoWrapFlags(SCEV::NoWrapMask))
     return true;
@@ -1070,7 +1073,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
     // assumes the object in memory is aligned to the natural alignment.
     unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
     if (!NullPointerIsDefined(L->getHeader()->getParent(), AddrSpace) &&
-        (Stride == 1 || Stride == -1))
+        PSE.getSE()->getAbsExpr(*Stride, false)->isOne())
       return true;
   }
 
@@ -1654,15 +1657,15 @@ void AccessAnalysis::processMemAccesses() {
   }
 }
 
-/// Check whether the access through \p Ptr has a constant stride.
-std::optional<int64_t>
-llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
-                   const Loop *Lp, const DominatorTree &DT,
-                   const DenseMap<Value *, const SCEV *> &StridesMap,
-                   bool Assume, bool ShouldCheckWrap) {
+/// Check whether the access through \p Ptr has a loop invariant stride of a
+/// statically known sign.
+std::optional<const SCEV *> static getPtrStrideScev(
+    PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp,
+    const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap,
+    bool Assume, bool ShouldCheckWrap) {
   const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
   if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
-    return 0;
+    return PSE.getSE()->getZero(Type::getInt64Ty(AccessTy->getContext()));
 
   assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
 
@@ -1676,7 +1679,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
     return std::nullopt;
   }
 
-  std::optional<int64_t> Stride =
+  std::optional<const SCEV *> Stride =
       getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
   if (!ShouldCheckWrap || !Stride)
     return Stride;
@@ -1690,6 +1693,22 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
   return std::nullopt;
 }
 
+std::optional<int64_t>
+llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
+                   const Loop *Lp, const DominatorTree &DT,
+                   const DenseMap<Value *, const SCEV *> &StridesMap,
+                   bool Assume, bool ShouldCheckWrap) {
+  std::optional<const SCEV *> StrideScev = getPtrStrideScev(
+      PSE, AccessTy, Ptr, Lp, DT, StridesMap, Assume, ShouldCheckWrap);
+  if (!StrideScev)
+    return std::nullopt;
+  const APInt *APStride = nullptr;
+  if (!match(*StrideScev, m_scev_APInt(APStride)))
+    return std::nullopt;
+
+  return APStride->trySExtValue();
+}
+
 std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
                                              Type *ElemTyB, Value *PtrB,
                                              const DataLayout &DL,

>From 0b50f06dbc0ff44e1d11adafe509b00fd899c21c Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Wed, 11 Mar 2026 14:47:09 -0700
Subject: [PATCH 3/3] [LAA] Allow vectorizing `A[NonZeroNonConstantStride*I] +=
 1`

In this patch we only do that when we can statically prove that the
non-constant stride is non-zero and that the resulting index doesn't
overflow. This can later be extended to introduce a run-time check when
it is not provable at compile time.

My main motivation for this is to move unit-strideness speculation to a
VPlan-based transformation. However, it cannot be done right now because
sometimes such speculation affects legality and we simply avoid
vectorizing the loop if it's not done. As such, we need to extend LAA to
properly support dependence analysis/RT checks for strided accesses
without speculating that the stride is one. This PR is expected to be
the first one on that journey.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 28 ++++--
 .../single_strided_readwrite.ll               | 95 ++++++++++++++++---
 2 files changed, 104 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 1b2df207f211d..621807b3f860e 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2755,15 +2755,29 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
     // read list. If we *did* see it before, then it is already in
     // the read-write list. This allows us to vectorize expressions
     // such as A[i] += x;  Because the address of A[i] is a read-write
-    // pointer. This only works if the index of A[i] is consecutive.
-    // If the address of i is unknown (for example A[B[i]]) then we may
-    // read a few words, modify, and write a few words, and some of the
-    // words may be written to the same address.
+    // pointer. This only works if the index of A[i] is strictly monotonic. We
+    // approximate (conservatively) that by checking for a strided access (with
+    // non-zero stride). If the address of i is unknown (for example A[B[i]])
+    // then we may read a few words, modify, and write a few words, and some of
+    // the words may be written to the same address.
     bool IsReadOnlyPtr = false;
     Type *AccessTy = getLoadStoreType(LD);
-    if (Seen.insert({Ptr, AccessTy}).second ||
-        !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false,
-                      true)) {
+    auto IsSafeReadWrite = [&] {
+      std::optional<const SCEV *> Stride = getPtrStrideScev(
+          *PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, true);
+      if (!Stride)
+        return false;
+
+      // Statically known invariant address, preserve old behavior for the
+      // LoopDistributePass. For LoopVectorizer we will detect a load from the
+      // uniform store pointer and bail out further below.
+      if ((*Stride)->isZero())
+        return true;
+
+      auto *SE = PSE->getSE();
+      return SE->isKnownPositive(SE->getAbsExpr((*Stride), false));
+    };
+    if (Seen.insert({Ptr, AccessTy}).second || !IsSafeReadWrite()) {
       ++NumReads;
       IsReadOnlyPtr = true;
     }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
index 390e694c0b340..43133aeb20531 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
@@ -4,13 +4,8 @@
 define void @known_safe(ptr %p, i8 %a) {
 ; CHECK-LABEL: 'known_safe'
 ; CHECK-NEXT:    header:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        IndirectUnsafe:
-; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
-; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
-; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
@@ -44,13 +39,8 @@ exit:
 define void @known_safe_byte_gep(ptr %p, i8 %a) {
 ; CHECK-LABEL: 'known_safe_byte_gep'
 ; CHECK-NEXT:    header:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        IndirectUnsafe:
-; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
-; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
-; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
@@ -241,3 +231,84 @@ header:
 exit:
   ret void
 }
+
+; Not too important to actually support for now, the priority is to handle the
+; one below correctly.
+define void @known_safe_varying_stride(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_safe_varying_stride'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %mul = mul nsw nuw i64 %iv.next, %iv.next
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %mul
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @unsafe_varying_stride(ptr %p) {
+; CHECK-LABEL: 'unsafe_varying_stride'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %mul = mul nsw nuw i64 %iv.next, %iv.next
+
+  ; 0, 0, 3, ...
+  %idx = sub nsw nuw i64 %mul, %iv
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}



More information about the llvm-commits mailing list