[llvm] [LAA] Compute pointer bounds for pattern with urem operation (PR #106574)
Sergey Kachkov via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 08:48:35 PDT 2024
https://github.com/skachkov-sc created https://github.com/llvm/llvm-project/pull/106574
**Motivation**
Let's consider the following example: https://godbolt.org/z/7r8KaaKz4
Despite to very similar code, only the first test can be vectorized. The reason is that `(i + 1) % N` is replaced to `i == (N - 1) ? 0 : i + 1` by IndVarSimplify, and each part of this select instruction is supported by LoopAccessAnalysis as a "forked pointer" (`src[0]` is loop-invariant, and `src[i+1]` is affine AddRec). In the second test, `(i + 2) % N` is represented as urem instruction, and this case is currently not supported by LAA -- but its bounds still can be determined.
**Proposed solution**
In this patch I'm trying to match SCEV in form of:
```
PtrScev = BasePtr + (Dividend urem Divisor) * ConstStride
```
(BasePtr and Divisor must be loop-invariant). If it's successful, access bounds are [BasePtr, BasePtr + (Divisor - 1) * ConstStride + ElemSize).
First commit also contains a small refactoring: currently, logic of processing access bounds is distributed accross 3 functions: hasComputableBounds(), getStartAndEndForAccess() and some checks in findForkedPointer(). I've tried to rework this in a way that it's handled from 2 places: hasComputableBounds and getStartAndEndForAccess (ideally, this should be the one place, but it's harder to implement).
llvm-test-suite results:
```
Program loop-vectorize.LoopsVectorized
before after diff
test-suite :: External/SPEC/CINT2017rate/525.x264_r/525.x264_r.test 98.00 99.00 1.0%
test-suite :: External/SPEC/CINT2017speed/625.x264_s/625.x264_s.test 98.00 99.00 1.0%
test-suite :: MultiSource/Benchmarks/7zip/7zip-benchmark.test 364.00 365.00 0.3%
```
(we have additionaly vectorized loop in SPEC2017 x264 and in 7zip-benchmark).
>From 3415b7e7ea53a74a87bf7048b297971a67ed7ada Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Tue, 27 Aug 2024 17:48:02 +0300
Subject: [PATCH 1/3] [LAA][NFCI] Re-use hasComputableBounds in
findForkedPointer and refactor createCheckForAccess
---
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 40 ++++++++++--------------
1 file changed, 17 insertions(+), 23 deletions(-)
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 8d53a27fb75eb4..22d62e35c45813 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -809,7 +809,8 @@ class AccessAnalysis {
/// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
/// by adding run-time checks (overflow checks) if necessary.
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
- const SCEV *PtrScev, Loop *L, bool Assume) {
+ const SCEV *PtrScev, const Loop *L,
+ bool Assume) {
// The bounds for loop-invariant pointer is trivial.
if (PSE.getSE()->isLoopInvariant(PtrScev, L))
return true;
@@ -1041,13 +1042,10 @@ findForkedPointer(PredicatedScalarEvolution &PSE,
SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
- // For now, we will only accept a forked pointer with two possible SCEVs
- // that are either SCEVAddRecExprs or loop invariant.
+ // For now, we will only accept a forked pointer with two possible SCEVs.
if (Scevs.size() == 2 &&
- (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
- SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
- (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
- SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
+ hasComputableBounds(PSE, Ptr, get<0>(Scevs[0]), L, /*Assume*/ false) &&
+ hasComputableBounds(PSE, Ptr, get<0>(Scevs[1]), L, /*Assume*/ false)) {
LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
@@ -1069,30 +1067,26 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
- for (const auto &P : TranslatedPtrs) {
- const SCEV *PtrExpr = get<0>(P);
- if (!hasComputableBounds(PSE, Ptr, PtrExpr, TheLoop, Assume))
+ if (TranslatedPtrs.size() == 1) {
+ auto &TranslatedPtr = TranslatedPtrs.front();
+ if (!hasComputableBounds(PSE, Ptr, get<0>(TranslatedPtr), TheLoop, Assume))
return false;
// When we run after a failing dependency check we have to make sure
// we don't have wrapping pointers.
- if (ShouldCheckWrap) {
- // Skip wrap checking when translating pointers.
- if (TranslatedPtrs.size() > 1)
+ if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
+ const SCEV *Expr = PSE.getSCEV(Ptr);
+ if (!Assume || !isa<SCEVAddRecExpr>(Expr))
return false;
-
- if (!isNoWrap(PSE, StridesMap, Ptr, AccessTy, TheLoop)) {
- const SCEV *Expr = PSE.getSCEV(Ptr);
- if (!Assume || !isa<SCEVAddRecExpr>(Expr))
- return false;
- PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
- }
+ PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
}
// If there's only one option for Ptr, look it up after bounds and wrap
// checking, because assumptions might have been added to PSE.
- if (TranslatedPtrs.size() == 1)
- TranslatedPtrs[0] = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr),
- false};
+ TranslatedPtr = {replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false};
+ } else {
+ // Skip wrap checking when translating pointers.
+ if (ShouldCheckWrap)
+ return false;
}
for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
>From 6e6d0fa7c920bd1d89e6961da08c33224e3116e5 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Mon, 19 Aug 2024 17:25:40 +0300
Subject: [PATCH 2/3] [LAA][NFC] Add pre-commit tests for urem pattern
---
.../LoopAccessAnalysis/urem-pattern.ll | 68 +++++++++++++++++++
1 file changed, 68 insertions(+)
create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
new file mode 100644
index 00000000000000..b4502e9e510093
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -disable-output -passes='print<access-info>' %s 2>&1 | FileCheck %s
+
+define void @test_stride_1(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
+; CHECK-LABEL: 'test_stride_1'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Report: cannot identify array bounds
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %loop, label %exit
+
+loop:
+ %i = phi i64 [ %inc, %loop ], [ 0, %entry ]
+ %add = add i64 %i, %offset
+ %rem = urem i64 %add, %n
+ %arrayidx = getelementptr inbounds i8, ptr %src, i64 %rem
+ %0 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i
+ store i8 %0, ptr %arrayidx1, align 1
+ %inc = add nuw nsw i64 %i, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @test_stride_4(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
+; CHECK-LABEL: 'test_stride_4'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Report: cannot identify array bounds
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Grouped accesses:
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %loop, label %exit
+
+loop:
+ %i = phi i64 [ %inc, %loop ], [ 0, %entry ]
+ %add = add i64 %i, %offset
+ %rem = urem i64 %add, %n
+ %arrayidx = getelementptr inbounds i32, ptr %src, i64 %rem
+ %0 = load i32, ptr %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %i
+ store i32 %0, ptr %arrayidx1, align 4
+ %inc = add nuw nsw i64 %i, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
>From fc265dfe0fef8557179bac9be11f90887e6fd2f5 Mon Sep 17 00:00:00 2001
From: Sergey Kachkov <sergey.kachkov at syntacore.com>
Date: Tue, 27 Aug 2024 18:17:17 +0300
Subject: [PATCH 3/3] [LAA] Compute pointer bounds for pattern with urem
operation
---
llvm/include/llvm/Analysis/ScalarEvolution.h | 5 +++
llvm/lib/Analysis/LoopAccessAnalysis.cpp | 34 ++++++++++++++++++-
llvm/lib/Analysis/ScalarEvolution.cpp | 34 +++++++++++++++++++
.../LoopAccessAnalysis/urem-pattern.ll | 26 ++++++++++++--
4 files changed, 96 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index fe46a504bce5d1..030ccf166881a2 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1027,6 +1027,11 @@ class ScalarEvolution {
bool isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero = false,
bool OrNegative = false);
+ /// Test if Expr can be represented as (A urem B) * Multiplier. If successful,
+ /// assign A and B to LHS and RHS, respectively.
+ bool isURemWithKnownMultiplier(const SCEV *Expr, const SCEV *Multiplier,
+ const SCEV *&LHS, const SCEV *&RHS);
+
/// Splits SCEV expression \p S into two SCEVs. One of them is obtained from
/// \p S by substitution of all AddRec sub-expression related to loop \p L
/// with initial value of that SCEV. The second is obtained from \p S by
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 22d62e35c45813..3e01ae5cb42e56 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,6 +190,32 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
Members.push_back(Index);
}
+// Match expression in form of: PtrBase + (Dividend urem Divisor) * ConstStride,
+// where PtrBase and Divisor are loop-invariant and ConstStride is non-negative.
+// In this case Start = PtrBase, End = PtrBase + (Divisor - 1) * ConstStride.
+static std::optional<std::pair<const SCEV *, const SCEV *>>
+getStartAndEndForURemAccess(ScalarEvolution &SE, const SCEV *PtrScev,
+ const Loop *L) {
+ const SCEV *PtrBase = SE.getPointerBase(PtrScev);
+ if (!SE.isLoopInvariant(PtrBase, L))
+ return std::nullopt;
+ const SCEV *PtrAddend = SE.removePointerBase(PtrScev);
+ auto ConstStride = SE.getConstantMultiple(PtrAddend);
+ if (ConstStride.isNegative())
+ return std::nullopt;
+ const SCEV *StrideScev = SE.getConstant(ConstStride);
+ const SCEV *Dividend, *Divisor;
+ if (!SE.isURemWithKnownMultiplier(PtrAddend, StrideScev, Dividend, Divisor))
+ return std::nullopt;
+ if (!SE.isLoopInvariant(Divisor, L))
+ return std::nullopt;
+ const SCEV *DivisorMinusOne =
+ SE.getAddExpr(Divisor, SE.getMinusOne(Divisor->getType()));
+ return std::make_pair(
+ PtrBase,
+ SE.getAddExpr(PtrBase, SE.getMulExpr(DivisorMinusOne, StrideScev)));
+}
+
/// Calculate Start and End points of memory access.
/// Let's assume A is the first access and B is a memory access on N-th loop
/// iteration. Then B is calculated as:
@@ -221,6 +247,8 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
if (SE->isLoopInvariant(PtrExpr, Lp)) {
ScStart = ScEnd = PtrExpr;
+ } else if (auto Bounds = getStartAndEndForURemAccess(*SE, PtrExpr, Lp)) {
+ std::tie(ScStart, ScEnd) = *Bounds;
} else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
@@ -811,8 +839,12 @@ class AccessAnalysis {
static bool hasComputableBounds(PredicatedScalarEvolution &PSE, Value *Ptr,
const SCEV *PtrScev, const Loop *L,
bool Assume) {
+ ScalarEvolution *SE = PSE.getSE();
// The bounds for loop-invariant pointer is trivial.
- if (PSE.getSE()->isLoopInvariant(PtrScev, L))
+ if (SE->isLoopInvariant(PtrScev, L))
+ return true;
+
+ if (getStartAndEndForURemAccess(*SE, PtrScev, L))
return true;
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 54dde8401cdff0..e14090744ff46c 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -10871,6 +10871,40 @@ bool ScalarEvolution::isKnownToBeAPowerOfTwo(const SCEV *S, bool OrZero,
return all_of(Mul->operands(), NonRecursive) && (OrZero || isKnownNonZero(S));
}
+bool ScalarEvolution::isURemWithKnownMultiplier(const SCEV *Expr,
+ const SCEV *Multiplier,
+ const SCEV *&LHS,
+ const SCEV *&RHS) {
+ // Case with Multiplier == 1: just match URem expr.
+ if (Multiplier->isOne())
+ return matchURem(Expr, LHS, RHS);
+ // In case of Multiplier != 1, try to match Expr in form of:
+ // (-Multiplier * (Dividend /u Divisor) * Divisor) + (Multiplier * Dividend).
+ const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
+ if (!Add || Add->getNumOperands() != 2)
+ return false;
+ const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
+ if (!Mul || Mul->getNumOperands() != 3 ||
+ Mul->getOperand(0) != getNegativeSCEV(Multiplier))
+ return false;
+ const auto *A = Add->getOperand(1);
+
+ const auto MatchDividend = [&](const SCEV *Expr, const SCEV *Divisor) {
+ const auto *UDiv = dyn_cast<SCEVUDivExpr>(Expr);
+ if (!UDiv || UDiv->getRHS() != Divisor)
+ return false;
+ const auto *Dividend = UDiv->getLHS();
+ if (getMulExpr(Dividend, Multiplier) != A)
+ return false;
+ LHS = Dividend;
+ RHS = Divisor;
+ return true;
+ };
+
+ return MatchDividend(Mul->getOperand(1), Mul->getOperand(2)) ||
+ MatchDividend(Mul->getOperand(2), Mul->getOperand(1));
+}
+
std::pair<const SCEV *, const SCEV *>
ScalarEvolution::SplitIntoInitAndPostInc(const Loop *L, const SCEV *S) {
// Compute SCEV on entry of loop L.
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
index b4502e9e510093..4650cad04b677d 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/urem-pattern.ll
@@ -4,10 +4,21 @@
define void @test_stride_1(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
; CHECK-LABEL: 'test_stride_1'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: cannot identify array bounds
+; CHECK-NEXT: Memory dependences are safe with run-time checks
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT: %arrayidx1 = getelementptr inbounds i8, ptr %dst, i64 %i
+; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i8, ptr %src, i64 %rem
; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[GRP1]]:
+; CHECK-NEXT: (Low: %dst High: (%n + %dst))
+; CHECK-NEXT: Member: {%dst,+,1}<nuw><%loop>
+; CHECK-NEXT: Group [[GRP2]]:
+; CHECK-NEXT: (Low: %src High: (%n + %src))
+; CHECK-NEXT: Member: ((-1 * ({%offset,+,1}<nw><%loop> /u %n) * %n) + {(%offset + %src),+,1}<nw><%loop>)
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
@@ -37,10 +48,21 @@ exit:
define void @test_stride_4(ptr writeonly %dst, ptr readonly %src, i64 %n, i64 %offset) {
; CHECK-LABEL: 'test_stride_4'
; CHECK-NEXT: loop:
-; CHECK-NEXT: Report: cannot identify array bounds
+; CHECK-NEXT: Memory dependences are safe with run-time checks
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT: %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %i
+; CHECK-NEXT: Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT: %arrayidx = getelementptr inbounds i32, ptr %src, i64 %rem
; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group [[GRP3]]:
+; CHECK-NEXT: (Low: %dst High: ((4 * %n) + %dst))
+; CHECK-NEXT: Member: {%dst,+,4}<nuw><%loop>
+; CHECK-NEXT: Group [[GRP4]]:
+; CHECK-NEXT: (Low: %src High: ((4 * %n) + %src))
+; CHECK-NEXT: Member: ((-4 * ({%offset,+,1}<nw><%loop> /u %n) * %n) + {((4 * %offset) + %src),+,4}<%loop>)
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
More information about the llvm-commits
mailing list