[llvm] [LAA] Determine Dst and Src overlapping by SCEV of Src and Dist (PR #79947)

via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 30 05:56:00 PST 2024

https://github.com/ShivaChen updated https://github.com/llvm/llvm-project/pull/79947

>From dd985f8599920e69259b26209b5f3698b5eddcd8 Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Tue, 30 Jan 2024 03:52:25 +0000
Subject: [PATCH 1/3] Add vectorize-s115.ll

 .../LoopVectorize/vectorize-s115.ll           | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/vectorize-s115.ll

diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
new file mode 100644
index 000000000000..327b8b8fd7c7
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -S | FileCheck %s
+ at aa = global [256 x [256 x float]] zeroinitializer, align 4
+ at a = global [32000 x float] zeroinitializer, align 4
+;; for (int j = 0; j < 256; j++)
+;;   for (int i = j+1; i < 256; i++)
+;;     a[i] -= aa[j][i] * a[j];
+; CHECK-NOT: vector.body:
+define signext i32 @s115() {
+  br label %for.body
+for.cond.loopexit.loopexit:                       ; preds = %for.body4
+  br label %for.cond.loopexit
+for.cond.loopexit:                                ; preds = %for.cond.loopexit.loopexit, %for.body
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond28.not = icmp eq i64 %indvars.iv.next27, 256
+  br i1 %exitcond28.not, label %for.cond.cleanup, label %for.body
+for.cond.cleanup:                                 ; preds = %for.cond.loopexit
+  ret i32 0
+for.body:                                         ; preds = %entry, %for.cond.loopexit
+  %indvars.iv26 = phi i64 [ 0, %entry ], [ %indvars.iv.next27, %for.cond.loopexit ]
+  %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.cond.loopexit ]
+  %indvars.iv.next27 = add nuw nsw i64 %indvars.iv26, 1
+  %cmp221 = icmp ult i64 %indvars.iv26, 255
+  br i1 %cmp221, label %for.body4.lr.ph, label %for.cond.loopexit
+for.body4.lr.ph:                                  ; preds = %for.body
+  %arrayidx8 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv26
+  br label %for.body4
+for.body4:                                        ; preds = %for.body4.lr.ph, %for.body4
+  %indvars.iv24 = phi i64 [ %indvars.iv, %for.body4.lr.ph ], [ %indvars.iv.next25, %for.body4 ]
+  %arrayidx6 = getelementptr inbounds [256 x [256 x float]], ptr @aa, i64 0, i64 %indvars.iv26, i64 %indvars.iv24
+  %0 = load float, ptr %arrayidx6, align 4
+  %1 = load float, ptr %arrayidx8, align 4
+  %arrayidx10 = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 %indvars.iv24
+  %2 = load float, ptr %arrayidx10, align 4
+  %neg = fneg float %0
+  %3 = tail call float @llvm.fmuladd.f32(float %neg, float %1, float %2)
+  store float %3, ptr %arrayidx10, align 4
+  %indvars.iv.next25 = add nuw nsw i64 %indvars.iv24, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next25, 256
+  br i1 %exitcond.not, label %for.cond.loopexit.loopexit, label %for.body4

>From 5f8408237b5835d5baf967b8a7b493c80b5bc412 Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Mon, 29 Jan 2024 08:12:04 +0000
Subject: [PATCH 2/3] [LAA] Determine Dst and Src overlapping by SCEV of Src
 and Dist

Consider the following case:

  for (int j = 0; j < 256; j++)    // Loop j
    for (int i = j+1; i < 256; i++)// Loop i
      a[i] -= aa[j][i] * a[j];

Given that the SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] will be treated as a
scalar when vectorizing Loop i. If the access size of a[j] <= Dist(a[j], a[i]),
the two accesses do not overlap and Loop i can be vectorized.

In this case, the access size of a[j] is 4 bytes (float) and Dist(a[j], a[i])
is {4,+,4}, which gives a minimum distance of 4.
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 71 +++++++++++++++++++
 .../LoopVectorize/vectorize-s115.ll           | 13 +++-
 2 files changed, 81 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index dd6b88fee415..67beec09949f 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1917,6 +1917,74 @@ isLoopVariantIndirectAddress(ArrayRef<const Value *> UnderlyingObjects,
+static bool isAffectedByLoop(const SCEV *Expr, const Loop *L,
+                             ScalarEvolution &SE) {
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+  if (!AddRec)
+    return false;
+  if (AddRec->getLoop() == L)
+    return true;
+  const SCEV *Start = AddRec->getStart();
+  const SCEV *Step = AddRec->getStepRecurrence(SE);
+  return isAffectedByLoop(Start, L, SE) || isAffectedByLoop(Step, L, SE);
+// Consider the following case:
+// for (int j = 0; j < 256; j++)    // Loop j
+//   for (int i = j+1; i < 256; i++)// Loop i
+//     a[i] -= aa[j][i] * a[j];
+// Given that the SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] is treated as a scalar
+// when vectorizing Loop i. If the access size of a[j] <= Dist(a[j], a[i]),
+// the two accesses do not overlap and Loop i can be vectorized.
+// In this case, the access size of a[j] is 4 bytes (float) and
+// Dist(a[j], a[i]) is {4,+,4}, which gives a minimum distance of 4.
+// Return true if Dist is equal or greater than the accessing size of Src.
+static bool isSrcNoOverlap(const SCEV *Src, Instruction *AInst,
+                           const SCEV *Dist, const Loop *InnermostLoop,
+                           ScalarEvolution &SE) {
+  // If the Src is not affected by InnermostLoop, when vectorizing
+  // InnermostLoop, Src will be treated as scalar instead of widening to vector.
+  if (isAffectedByLoop(Src, InnermostLoop, SE))
+    return false;
+  if (!isa<SCEVAddRecExpr>(Dist))
+    return false;
+  auto *Diff = cast<SCEVAddRecExpr>(Dist);
+  if (Diff->getLoop() != InnermostLoop)
+    return false;
+  if (!isa<SCEVConstant>(Diff->getStart()))
+    return false;
+  if (!isa<SCEVConstant>(Diff->getStepRecurrence(SE)))
+    return false;
+  const SCEVConstant *DiffInc = cast<SCEVConstant>(Diff->getStepRecurrence(SE));
+  if (DiffInc->getAPInt().isNegative())
+    return false;
+  // If the step of Diff is positive and the Start of Diff is constant,
+  // we can get the minimum diff between Src and Dst.
+  const SCEVConstant *MinDiff = cast<SCEVConstant>(Diff->getStart());
+  // If we get here, Src won't be vectorized, so we only need to consider the
+  // scalar load/store size. If the minimum diff between Src and Dst is equal
+  // or greater than the load/store size, there is no overlap.
+  if (MinDiff->getAPInt().getSExtValue() >=
+      getLoadStoreType(AInst)->getScalarSizeInBits() / 8)
+    return true;
+  return false;
 // Get the dependence distance, stride, type size in whether i is a write for
 // the dependence between A and B. Returns a DepType, if we can prove there's
 // no dependence or the analysis fails. Outlined to lambda to limit he scope
@@ -1979,6 +2047,9 @@ getDependenceDistanceStrideAndSize(
     return MemoryDepChecker::Dependence::IndirectUnsafe;
+  if (isSrcNoOverlap(Src, AInst, Dist, InnermostLoop, SE))
+    return MemoryDepChecker::Dependence::NoDep;
   // Need accesses with constant stride. We don't want to vectorize
   // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
   // in the address space.
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
index 327b8b8fd7c7..e17236a8f23a 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-s115.ll
@@ -3,11 +3,18 @@
 @aa = global [256 x [256 x float]] zeroinitializer, align 4
 @a = global [32000 x float] zeroinitializer, align 4
-;; for (int j = 0; j < 256; j++)
-;;   for (int i = j+1; i < 256; i++)
+;; Given that the SCEV of &a[j] is {@a,+,4}<Loop j>, a[j] is treated as a scalar
+;; when vectorizing Loop i. If the access size of a[j] <= Dist(a[j], a[i]),
+;; the two accesses do not overlap and Loop i can be vectorized.
+;; In this case, the access size of a[j] is 4 bytes (float) and
+;; Dist(a[j], a[i]) is {4,+,4}, which gives a minimum distance of 4.
+;; for (int j = 0; j < 256; j++)    // Loop j
+;;   for (int i = j+1; i < 256; i++)// Loop i
 ;;     a[i] -= aa[j][i] * a[j];
-; CHECK-NOT: vector.body:
+; CHECK: vector.body:
 define signext i32 @s115() {

>From 42c60180cd24a12e9ede63711148812dc81f770a Mon Sep 17 00:00:00 2001
From: Shiva Chen <shiva.chen at imgtec.com>
Date: Tue, 30 Jan 2024 13:47:21 +0000
Subject: [PATCH 3/3] Add -max-dependences=97 to the X86 test case

interleaved-accesses-use-after-free.ll is intended to exceed max-dependences
so that the Dependences from MemoryDepChecker become unavailable.

The default value of max-dependences is 100.

This PR removes three dependences, which makes the Dependences available.

Add -max-dependences=97 to preserve the original semantics.
 .../LoopVectorize/X86/interleaved-accesses-use-after-free.ll  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
index d5239d5a4e33..3b9d53ca600b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-use-after-free.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s -check-prefix=LOOP-ACCESS
-; RUN: opt -passes=loop-vectorize -debug-only=vectorutils -force-vector-width=4 -disable-output %s 2>&1 | FileCheck %s
+; RUN: opt -passes=loop-vectorize -debug-only=loop-accesses -force-vector-width=4 -max-dependences=97 -disable-output %s 2>&1 | FileCheck %s -check-prefix=LOOP-ACCESS
+; RUN: opt -passes=loop-vectorize -debug-only=vectorutils -force-vector-width=4 -max-dependences=97 -disable-output %s 2>&1 | FileCheck %s
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-redhat-linux-gnu"

More information about the llvm-commits mailing list