[llvm] [LAA] Determine Dst and Src overlapping by SCEV of Src and Dist (PR #79947)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 29 21:49:17 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
@llvm/pr-subscribers-llvm-analysis
Author: None (ShivaChen)
<details>
<summary>Changes</summary>
Consider the following case:
for (int j = 0; j < 256; j++) // Loop j
for (int i = j+1; i < 256; i++)// Loop i
a[i] -= aa[j][i] * a[j];
Given that the SCEV of `&a[j]` is `{@a,+,4}<Loop j>`, a[j] will be treated as a scalar when vectorizing Loop i. If the `access size of a[j]` <= `Dist(a[j], a[i])`, the accesses do not overlap and Loop i can be vectorized.
In this case, the access size of a[j] is 4 bytes (float) and Dist(a[j], a[i]) is {4,+,4}, which gives a minimum distance of 4.
---
Full diff: https://github.com/llvm/llvm-project/pull/79947.diff
2 Files Affected:
- (modified) llvm/lib/Analysis/LoopAccessAnalysis.cpp (+71)
- (added) llvm/test/Transforms/LoopVectorize/vectorize-s115.ll (+58)
``````````diff
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index dd6b88fee415a..67beec09949f4 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1917,6 +1917,74 @@ isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
});
}
+static bool isAffectedByLoop(const SCEV *Expr, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AddRec)
+ return false;
+
+ if (AddRec->getLoop() == L)
+ return true;
+
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(SE);
+ return isAffectedByLoop(Start, L, SE) || isAffectedByLoop(Step, L, SE);
+}
+
+// Consider the following case:
+//
+// for (int j = 0; j < 256; j++) // Loop j
+// for (int i = j+1; i < 256; i++)// Loop i
+// a[i] -= aa[j][i] * a[j];
+//
+// Given that SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] will be treated as scalar
+// when vectorizing Loop i. If the accessing size of a[j] <= Dist(a[j], a[i]),
+// there is no overlapped and can be vectorized.
+//
+// In this case, accessing size of a[j] is 4 byte(float) and Dist(a[j], a[i])
+// is {4,+,4} which bring the minimum distance as 4.
+//
+// Return true if Dist is equal or greater than the accessing size of Src.
+static bool isSrcNoOverlap(const SCEV *Src, Instruction *AInst,
+ const SCEV *Dist, const Loop *InnermostLoop,
+ ScalarEvolution &SE) {
+ // If the Src is not affected by InnermostLoop, when vectorizing
+ // InnermostLoop, Src will be treated as scalar instead of widening to vector.
+ if (isAffectedByLoop(Src, InnermostLoop, SE))
+ return false;
+
+ if (!isa<SCEVAddRecExpr>(Dist))
+ return false;
+
+ auto *Diff = cast<SCEVAddRecExpr>(Dist);
+
+ if (Diff->getLoop() != InnermostLoop)
+ return false;
+
+ if (!isa<SCEVConstant>(Diff->getStart()))
+ return false;
+
+ if (!isa<SCEVConstant>(Diff->getStepRecurrence(SE)))
+ return false;
+
+ const SCEVConstant *DiffInc = cast<SCEVConstant>(Diff->getStepRecurrence(SE));
+ if (DiffInc->getAPInt().isNegative())
+ return false;
+
+ // If the step of Diff is positve and the Start of diff is constant,
+ // we can get the minimum diff between Src and Dst.
+ const SCEVConstant *MinDiff = cast<SCEVConstant>(Diff->getStart());
+
+ // If we get here, Src won't be vectorized, so we only need to consider the
+ // scalar load/store size. If the minimum diff between Src and Dst is equal
+ // or greater than the load/store size, there is no overlapped.
+ if (MinDiff->getAPInt().getSExtValue() >=
+ getLoadStoreType(AInst)->getScalarSizeInBits() / 8)
+ return true;
+
+ return false;
+}
+
// Get the dependence distance, stride, type size in whether i is a write for
// the dependence between A and B. Returns a DepType, if we can prove there's
// no dependence or the analysis fails. Outlined to lambda to limit he scope
@@ -1979,6 +2047,9 @@ getDependenceDistanceStrideAndSize(
InnermostLoop))
return MemoryDepChecker::Dependence::IndirectUnsafe;
+ if (isSrcNoOverlap(Src, AInst, Dist, InnermostLoop, SE))
+ return MemoryDepChecker::Dependence::NoDep;
+
// Need accesses with constant stride. We don't want to vectorize
// "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
// in the address space.
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
new file mode 100644
index 0000000000000..e17236a8f23a1
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -S | FileCheck %s
+
+ at aa = global [256 x [256 x float]] zeroinitializer, align 4
+ at a = global [32000 x float] zeroinitializer, align 4
+
+;; Given that SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] will be treated as scalar
+;; when vectorizing Loop i. If the accessing size of a[j] <= Dist(a[j], a[i]),
+;; there is no overlapped and can be vectorized.
+;;
+;; In this case, accessing size of a[j] is 4 byte(float) and Dist(a[j], a[i])
+;; is {4,+,4} which bring the minimum distance as 4.
+;;
+;; for (int j = 0; j < 256; j++) // Loop j
+;; for (int i = j+1; i < 256; i++)// Loop i
+;; a[i] -= aa[j][i] * a[j];
+
+; CHECK: vector.body:
+
+define signext i32 @s115() {
+entry:
+ br label %for.body
+
+for.cond.loopexit.loopexit: ; preds = %for.body4
+ br label %for.cond.loopexit
+
+for.cond.loopexit: ; preds = %for.cond.loopexit.loopexit, %for.body
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond28.not = icmp eq i64 %indvars.iv.next27, 256
+ br i1 %exitcond28.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.cond.loopexit
+ ret i32 0
+
+for.body: ; preds = %entry, %for.cond.loopexit
+ %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ]
+ %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.cond.loopexit ]
+ %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
+ %cmp221 = icmp ult i64 %indvars.iv26, 255
+ br i1 %cmp221, label %for.body4.lr.ph, label %for.cond.loopexit
+
+for.body4.lr.ph: ; preds = %for.body
+ %arrayidx8 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv26
+ br label %for.body4
+
+for.body4: ; preds = %for.body4.lr.ph, %for.body4
+ %indvars.iv24 = phi i64 [ %indvars.iv, %for.body4.lr.ph ], [ %indvars.iv.next25, %for.body4 ]
+ %arrayidx6 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv26, i64 %indvars.iv24
+ %0 = load float, ptr %arrayidx6, align 4
+ %1 = load float, ptr %arrayidx8, align 4
+ %arrayidx10 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv24
+ %2 = load float, ptr %arrayidx10, align 4
+ %neg = fneg float %0
+ %3 = tail call float @llvm.fmuladd.f32(float %neg, float %1, float %2)
+ store float %3, ptr %arrayidx10, align 4
+ %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next25, 256
+ br i1 %exitcond.not, label %for.cond.loopexit.loopexit, label %for.body4
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/79947
More information about the llvm-commits
mailing list