[llvm] [LAA] Allow vectorizing `A[NonZeroNonConstantStride*I] += 1` (PR #186262)

Andrei Elovikov via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 12 14:48:31 PDT 2026


https://github.com/eas created https://github.com/llvm/llvm-project/pull/186262

In this patch we only do that when we can statically prove that the
non-constant stride is non-zero and the resulting index doesn't
overflow. This can later be extended to introduce a run-time check when
it is not provable at compile time.

My main motivation for this is to move unit-strideness speculation to a
VPlan-based transformation. However, it cannot be done right now because
sometimes such speculation affects legality, and we simply avoid
vectorizing the loop if it's not done. As such, we need to extend LAA to
properly support dependence analysis/RT checks for strided accesses
without speculating that the stride is one. This PR is expected to be
the first step on that journey.

>From f0f9da389f390f74eee6ed908f525e0037455229 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Wed, 11 Mar 2026 12:45:41 -0700
Subject: [PATCH 1/2] [NFC][LAA] Add a test for a single strided-read-write
 access

---
 .../single_strided_readwrite.ll               | 243 ++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
new file mode 100644
index 0000000000000..390e694c0b340
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
@@ -0,0 +1,243 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes='print<access-info>' -disable-output < %s -enable-mem-access-versioning=false 2>&1 | FileCheck %s
+
+define void @known_safe(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_safe'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %a.zext = zext i8 %a to i64
+  %stride = add i64 %a.zext, 1
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @known_safe_byte_gep(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_safe_byte_gep'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %a.zext = zext i8 %a to i64
+  %stride.elts = add i64 %a.zext, 1
+  %stride = mul i64 %stride.elts, 8
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `%a u> 0` RT check.
+define void @known_non_negative(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_non_negative'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %stride = zext i8 %a to i64
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `%a u> 0` RT check.
+define void @known_non_negative_scaled_for_byte_gep(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_non_negative_scaled_for_byte_gep'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %a.zext = zext i8 %a to i64
+  %stride = mul nsw nuw i64 %a.zext, 8
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `%a u> 8` RT check.
+define void @known_non_negative_nonscaled_for_byte_gep(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_non_negative_nonscaled_for_byte_gep'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %stride = zext i8 %a to i64
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+; This would require `abs(%a) u> 8` RT check.
+define void @arbitrary(ptr %p, i64 %stride) {
+; CHECK-LABEL: 'arbitrary'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %idx = mul nsw nuw i64 %iv, %stride
+
+  %gep = getelementptr inbounds i8, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}

>From 3271f2acb340fb8f1859cee046ab1c7720cd9545 Mon Sep 17 00:00:00 2001
From: Andrei Elovikov <andrei.elovikov at sifive.com>
Date: Wed, 11 Mar 2026 14:47:09 -0700
Subject: [PATCH 2/2] [LAA] Allow vectorizing `A[NonZeroNonConstantStride*I] +=
 1`

In this patch we only do that when we can statically prove that the
non-constant stride is non-zero and the resulting index doesn't
overflow. This can later be extended to introduce a run-time check when
it is not provable at compile time.

My main motivation for this is to move unit-strideness speculation to a
VPlan-based transformation. However, it cannot be done right now because
sometimes such speculation affects legality, and we simply avoid
vectorizing the loop if it's not done. As such, we need to extend LAA to
properly support dependence analysis/RT checks for strided accesses
without speculating that the stride is one. This PR is expected to be
the first step on that journey.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 45 +++++++--
 .../LoopAccessAnalysis/pointer-phis.ll        |  8 ++
 .../single_strided_readwrite.ll               | 95 ++++++++++++++++---
 3 files changed, 130 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 50ad35a30ea84..d6971d77d8938 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -2732,15 +2732,48 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
     // read list. If we *did* see it before, then it is already in
     // the read-write list. This allows us to vectorize expressions
     // such as A[i] += x;  Because the address of A[i] is a read-write
-    // pointer. This only works if the index of A[i] is consecutive.
-    // If the address of i is unknown (for example A[B[i]]) then we may
-    // read a few words, modify, and write a few words, and some of the
-    // words may be written to the same address.
+    // pointer. This only works if the index of A[i] is strictly monotonic. We
+    // approximate (conservatively) that by checking for a strided access (with
+    // non-zero stride). If the address of i is unknown (for example A[B[i]])
+    // then we may read a few words, modify, and write a few words, and some of
+    // the words may be written to the same address.
     bool IsReadOnlyPtr = false;
     Type *AccessTy = getLoadStoreType(LD);
+    auto IsMonotonicAndAtLeastAccessSizeStep = [&] {
+      const SCEV *PtrScev =
+          replaceSymbolicStrideSCEV(*PSE, SymbolicStrides, Ptr);
+
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+      if (!AR)
+        return false;
+
+      // Non-linear AddRecs might not be strictly monotonic, e.g., i^2-i would
+      // result in the following accesses:
+      //   0, 0, 3, ...
+      if (!AR->isAffine())
+        return false;
+
+      // We can probably compare against the max possible type size, but skip
+      // for now.
+      if (isa<ScalableVectorType>(AccessTy))
+        return false;
+
+      auto *SE = PSE->getSE();
+      const SCEV *Step = AR->getStepRecurrence(*SE);
+      const SCEV *AbsStep = SE->getAbsExpr(Step, false);
+
+      const SCEV *TypeSize = SE->getSizeOfExpr(
+          Step->getType(), SE->getDataLayout().getTypeStoreSize(AccessTy));
+      if (!SE->isKnownNonNegative(SE->getMinusSCEV(AbsStep, TypeSize)))
+        return false;
+
+      if (!isNoWrap(*PSE, AR, Ptr, AccessTy, TheLoop, false, *DT))
+        return false;
+
+      return true;
+    };
     if (Seen.insert({Ptr, AccessTy}).second ||
-        !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false,
-                      true)) {
+        !IsMonotonicAndAtLeastAccessSizeStep()) {
       ++NumReads;
       IsReadOnlyPtr = true;
     }
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll
index 6fbe0e45976b6..58b7dee968d60 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll
@@ -505,6 +505,14 @@ define void @phi_load_store_memdep_check(i1 %c, ptr %A, ptr %B, ptr %C) {
 ; CHECK-NEXT:            %lv3 = load i16, ptr %c.sink, align 2 ->
 ; CHECK-NEXT:            store i16 %add, ptr %c.sink, align 1
 ; CHECK-EMPTY:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            %lv = load i16, ptr %A, align 1 ->
+; CHECK-NEXT:            store i16 %lv, ptr %A, align 1
+; CHECK-EMPTY:
+; CHECK-NEXT:        Unknown:
+; CHECK-NEXT:            store i16 %lv, ptr %A, align 1 ->
+; CHECK-NEXT:            %lv2 = load i16, ptr %A, align 1
+; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Check 0:
 ; CHECK-NEXT:        Comparing group GRP0:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
index 390e694c0b340..43133aeb20531 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/single_strided_readwrite.ll
@@ -4,13 +4,8 @@
 define void @known_safe(ptr %p, i8 %a) {
 ; CHECK-LABEL: 'known_safe'
 ; CHECK-NEXT:    header:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        IndirectUnsafe:
-; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
-; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
-; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
@@ -44,13 +39,8 @@ exit:
 define void @known_safe_byte_gep(ptr %p, i8 %a) {
 ; CHECK-LABEL: 'known_safe_byte_gep'
 ; CHECK-NEXT:    header:
-; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        IndirectUnsafe:
-; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
-; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
-; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
@@ -241,3 +231,84 @@ header:
 exit:
   ret void
 }
+
+; Not too important to actually support for now, the priority is to handle the
+; one below correctly.
+define void @known_safe_varying_stride(ptr %p, i8 %a) {
+; CHECK-LABEL: 'known_safe_varying_stride'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %mul = mul nsw nuw i64 %iv.next, %iv.next
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %mul
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @unsafe_varying_stride(ptr %p) {
+; CHECK-LABEL: 'unsafe_varying_stride'
+; CHECK-NEXT:    header:
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:  Unsafe indirect dependence.
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:        IndirectUnsafe:
+; CHECK-NEXT:            %ld = load i64, ptr %gep, align 4 ->
+; CHECK-NEXT:            store i64 %add, ptr %gep, align 4
+; CHECK-EMPTY:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  br label %header
+
+header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
+  %iv.next = add nsw i64 %iv, 1
+  %mul = mul nsw nuw i64 %iv.next, %iv.next
+
+  ; 0, 0, 3, ...
+  %idx = sub nsw nuw i64 %mul, %iv
+
+  %gep = getelementptr inbounds i64, ptr %p, i64 %idx
+  %ld = load i64, ptr %gep
+  %add = add i64 %ld, %iv
+  store i64 %add, ptr %gep
+
+  %exitcond = icmp slt i64 %iv.next, 128
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}



More information about the llvm-commits mailing list