[llvm] [LAA] Compute pointer bounds for pattern with urem operation (PR #106574)

Thu Aug 29 08:48:35 PDT 2024

https://github.com/skachkov-sc created https://github.com/llvm/llvm-project/pull/106574

**Motivation**
Let's consider the following example: https://godbolt.org/z/7r8KaaKz4
Although the two tests contain very similar code, only the first one can be vectorized. The reason is that `(i + 1) % N` is replaced with `i == (N - 1) ? 0 : i + 1` by IndVarSimplify, and each arm of this select instruction is supported by LoopAccessAnalysis as a "forked pointer" (`src[0]` is loop-invariant, and `src[i+1]` is an affine AddRec). In the second test, `(i + 2) % N` is represented as a urem instruction, and this case is currently not supported by LAA -- but its bounds can still be determined.

**Proposed solution**
In this patch I try to match a SCEV of the form:
```
PtrScev = BasePtr + (Dividend urem Divisor) * ConstStride
```
(BasePtr and Divisor must be loop-invariant). If the match succeeds, the access bounds are [BasePtr, BasePtr + (Divisor - 1) * ConstStride + ElemSize).

The first commit also contains a small refactoring: currently, the logic for processing access bounds is distributed across 3 functions: hasComputableBounds(), getStartAndEndForAccess() and some checks in findForkedPointer(). I've tried to rework this so that it is handled in 2 places: hasComputableBounds() and getStartAndEndForAccess() (ideally, it would be handled in a single place, but that is harder to implement).

llvm-test-suite results:
```
Program                                                              loop-vectorize.LoopsVectorized                
                                                                     before                         after     diff 
 test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test   98.00                          99.00    1.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test   98.00                          99.00    1.0%
       test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test  364.00                         365.00    0.3%
```
(one additional loop is now vectorized in SPEC2017 x264 and one in 7zip-benchmark).


>From 3415b7e7ea53a74a87bf7048b297971a67ed7ada Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Tue, 27 Aug 2024 17:48:02 +0300
Subject: [PATCH 1/3] [LAA][NFCI] Re-use hasComputableBounds in
 findForkedPointer and refactor createCheckForAccess

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp | 40 ++++++++++--------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 8d53a27fb75eb4..22d62e35c45813 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -809,7 +809,8 @@ class AccessAnalysis {
 /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
 /// by adding run-time checks (overflow checks) if necessary.
 static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
-                                const SCEV *PtrScev, Loop *L, bool Assume) {
+                                const SCEV *PtrScev, const Loop *L,
+                                bool Assume) {
   // The bounds for loop-invariant pointer is trivial.
   if (PSE.getSE()->isLoopInvariant(PtrScev, L))
     return true;
@@ -1041,13 +1042,10 @@ findForkedPointer(PredicatedScalarEvolution &PSE,
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
   findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
 
-  // For now, we will only accept a forked pointer with two possible SCEVs
-  // that are either SCEVAddRecExprs or loop invariant.
+  // For now, we will only accept a forked pointer with two possible SCEVs.
   if (Scevs.size() == 2 &&
-      (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
-       SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
-      (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
-       SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
+      hasComputableBounds(PSE, Ptr, get<0>(Scevs[0]), L, /*Assume*/ false) &&
+      hasComputableBounds(PSE, Ptr, get<0>(Scevs[1]), L, /*Assume*/ false)) {
     LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
     LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
     LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
@@ -1069,30 +1067,26 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
       findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
 
-  for (const auto &P : TranslatedPtrs) {
-    const SCEV *PtrExpr = get<0>(P);
-    if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
+  if (TranslatedPtrs.size() == 1) {
+    auto &TranslatedPtr = TranslatedPtrs.front();
+    if (!hasComputableBounds(PSE, Ptr, get<0>(TranslatedPtr), TheLoop, Assume))
       return false;
 
     // When we run after a failing dependency check we have to make sure
     // we don't have wrapping pointers.
-    if (ShouldCheckWrap) {
-      // Skip wrap checking when translating pointers.
-      if (TranslatedPtrs.size() > 1)
+    if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
+      const SCEV *Expr = PSE.getSCEV(Ptr);
+      if (!Assume || !isa<SCEVAddRecExpr>(Expr))
         return false;
-
-      if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
-        const SCEV *Expr = PSE.getSCEV(Ptr);
-        if (!Assume || !isa<SCEVAddRecExpr>(Expr))
-          return false;
-        PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
-      }
+      PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
     }
     // If there's only one option for Ptr, look it up after bounds and wrap
     // checking, because assumptions might have been added to PSE.
-    if (TranslatedPtrs.size() == 1)
-      TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
-                           false};
+    TranslatedPtr = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false};
+  } else {
+    // Skip wrap checking when translating pointers.
+    if (ShouldCheckWrap)
+      return false;
   }
 
   for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {

>From 6e6d0fa7c920bd1d89e6961da08c33224e3116e5 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Mon, 19 Aug 2024 17:25:40 +0300
Subject: [PATCH 2/3] [LAA][NFC] Add pre-commit tests for urem pattern

---
 .../LoopAccessAnalysis/urem-pattern.ll        | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
new file mode 100644
index 00000000000000..b4502e9e510093
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -disable-output -passes='print<access-info>' %s 2>&1 | FileCheck %s
+
+define void @test_stride_1(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
+; CHECK-LABEL: 'test_stride_1'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+  %i = phi i64 [ %inc, %loop ], [ 0, %entry ]
+  %add = add i64 %i, %offset
+  %rem = urem i64 %add, %n
+  %arrayidx = getelementptr inbounds i8, ptr %src, i64 %rem
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i
+  store i8 %0, ptr %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_stride_4(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
+; CHECK-LABEL: 'test_stride_4'
+; CHECK-NEXT:    loop:
+; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Dependences:
+; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT:      SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT:      Expressions re-written:
+;
+entry:
+  %cmp = icmp sgt i64 %n, 0
+  br i1 %cmp, label %loop, label %exit
+
+loop:
+  %i = phi i64 [ %inc, %loop ], [ 0, %entry ]
+  %add = add i64 %i, %offset
+  %rem = urem i64 %add, %n
+  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %rem
+  %0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %i
+  store i32 %0, ptr %arrayidx1, align 4
+  %inc = add nuw nsw i64 %i, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}

>From fc265dfe0fef8557179bac9be11f90887e6fd2f5 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Tue, 27 Aug 2024 18:17:17 +0300
Subject: [PATCH 3/3] [LAA] Compute pointer bounds for pattern with urem
 operation

---
 llvm/include/llvm/Analysis/ScalarEvolution.h  |  5 +++
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 34 ++++++++++++++++++-
 llvm/lib/Analysis/ScalarEvolution.cpp         | 34 +++++++++++++++++++
 .../LoopAccessAnalysis/urem-pattern.ll        | 26 ++++++++++++--
 4 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index fe46a504bce5d1..030ccf166881a2 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1027,6 +1027,11 @@ class ScalarEvolution {
   bool isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero = false,
                               bool OrNegative = false);
 
+  /// Test if Expr can be represented as (A urem B) * Multiplier. If successful,
+  /// assign A and B to LHS and RHS, respectively.
+  bool isURemWithKnownMultiplier(const SCEV *Expr, const SCEV *Multiplier,
+                                 const SCEV *&LHS, const SCEV *&RHS);
+
   /// Splits SCEV expression \p S into two SCEVs. One of them is obtained from
   /// \p S by substitution of all AddRec sub-expression related to loop \p L
   /// with initial value of that SCEV. The second is obtained from \p S by
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 22d62e35c45813..3e01ae5cb42e56 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,6 +190,32 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }
 
+// Match expression in form of: PtrBase + (Dividend urem Divisor) * ConstStride,
+// where PtrBase and Divisor are loop-invariant and ConstStride is non-negative.
+// In this case Start = PtrBase, End = PtrBase + (Divisor - 1) * ConstStride.
+static std::optional<std::pair<const SCEV *, const SCEV *>>
+getStartAndEndForURemAccess(ScalarEvolution &SE, const SCEV *PtrScev,
+                            const Loop *L) {
+  const SCEV *PtrBase = SE.getPointerBase(PtrScev);
+  if (!SE.isLoopInvariant(PtrBase, L))
+    return std::nullopt;
+  const SCEV *PtrAddend = SE.removePointerBase(PtrScev);
+  auto ConstStride = SE.getConstantMultiple(PtrAddend);
+  if (ConstStride.isNegative())
+    return std::nullopt;
+  const SCEV *StrideScev = SE.getConstant(ConstStride);
+  const SCEV *Dividend, *Divisor;
+  if (!SE.isURemWithKnownMultiplier(PtrAddend, StrideScev, Dividend, Divisor))
+    return std::nullopt;
+  if (!SE.isLoopInvariant(Divisor, L))
+    return std::nullopt;
+  const SCEV *DivisorMinusOne =
+      SE.getAddExpr(Divisor, SE.getMinusOne(Divisor->getType()));
+  return std::make_pair(
+      PtrBase,
+      SE.getAddExpr(PtrBase, SE.getMulExpr(DivisorMinusOne, StrideScev)));
+}
+
 /// Calculate Start and End points of memory access.
 /// Let's assume A is the first access and B is a memory access on N-th loop
 /// iteration. Then B is calculated as:
@@ -221,6 +247,8 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
 
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
+  } else if (auto Bounds = getStartAndEndForURemAccess(*SE, PtrExpr, Lp)) {
+    std::tie(ScStart, ScEnd) = *Bounds;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
     const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
 
@@ -811,8 +839,12 @@ class AccessAnalysis {
 static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
                                 const SCEV *PtrScev, const Loop *L,
                                 bool Assume) {
+  ScalarEvolution *SE = PSE.getSE();
   // The bounds for loop-invariant pointer is trivial.
-  if (PSE.getSE()->isLoopInvariant(PtrScev, L))
+  if (SE->isLoopInvariant(PtrScev, L))
+    return true;
+
+  if (getStartAndEndForURemAccess(*SE, PtrScev, L))
     return true;
 
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 54dde8401cdff0..e14090744ff46c 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10871,6 +10871,40 @@ bool ScalarEvolution::isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero,
   return all_of(Mul->operands(), NonRecursive) && (OrZero || isKnownNonZero(S));
 }
 
+bool ScalarEvolution::isURemWithKnownMultiplier(const SCEV *Expr,
+                                                const SCEV *Multiplier,
+                                                const SCEV *&LHS,
+                                                const SCEV *&RHS) {
+  // Case with Multiplier == 1: just match URem expr.
+  if (Multiplier->isOne())
+    return matchURem(Expr, LHS, RHS);
+  // In case of Multiplier != 1, try to match Expr in form of:
+  // (-Multiplier * (Dividend /u Divisor) * Divisor) + (Multiplier * Dividend).
+  const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
+  if (!Add || Add->getNumOperands() != 2)
+    return false;
+  const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
+  if (!Mul || Mul->getNumOperands() != 3 ||
+      Mul->getOperand(0) != getNegativeSCEV(Multiplier))
+    return false;
+  const auto *A = Add->getOperand(1);
+
+  const auto MatchDividend = [&](const SCEV *Expr, const SCEV *Divisor) {
+    const auto *UDiv = dyn_cast<SCEVUDivExpr>(Expr);
+    if (!UDiv || UDiv->getRHS() != Divisor)
+      return false;
+    const auto *Dividend = UDiv->getLHS();
+    if (getMulExpr(Dividend, Multiplier) != A)
+      return false;
+    LHS = Dividend;
+    RHS = Divisor;
+    return true;
+  };
+
+  return MatchDividend(Mul->getOperand(1), Mul->getOperand(2)) ||
+         MatchDividend(Mul->getOperand(2), Mul->getOperand(1));
+}
+
 std::pair<const SCEV *, const SCEV *>
 ScalarEvolution::SplitIntoInitAndPostInc(const Loop *L, const SCEV *S) {
   // Compute SCEV on entry of loop L.
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
index b4502e9e510093..4650cad04b677d 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
@@ -4,10 +4,21 @@
 define void @test_stride_1(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
 ; CHECK-LABEL: 'test_stride_1'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx = getelementptr inbounds i8, ptr %src, i64 %rem
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:          (Low: %dst High: (%n + %dst))
+; CHECK-NEXT:            Member: {%dst,+,1}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: %src High: (%n + %src))
+; CHECK-NEXT:            Member: ((-1 * ({%offset,+,1}<nw><%loop> /u %n) * %n) + {(%offset + %src),+,1}<nw><%loop>)
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -37,10 +48,21 @@ exit:
 define void @test_stride_4(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
 ; CHECK-LABEL: 'test_stride_4'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: cannot identify array bounds
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %i
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %arrayidx = getelementptr inbounds i32, ptr %src, i64 %rem
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:          (Low: %dst High: ((4 * %n) + %dst))
+; CHECK-NEXT:            Member: {%dst,+,4}<nuw><%loop>
+; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:          (Low: %src High: ((4 * %n) + %src))
+; CHECK-NEXT:            Member: ((-4 * ({%offset,+,1}<nw><%loop> /u %n) * %n) + {((4 * %offset) + %src),+,4}<%loop>)
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions: