[llvm] [LAA] Use PSE::getSymbolicMaxBackedgeTakenCount. (PR #93499)

Tue Jun 4 12:56:59 PDT 2024

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/93499

>From 112223d6ee72dfbebcb8df116c7cb3cfc03802fd Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 8 May 2024 20:47:29 +0100
Subject: [PATCH 1/3] [LAA] Use getBackedgeTakenCountForCountableExits.

Update LAA to use getBackedgeTakenCountForCountableExits which returns
the minimum of the countable exits

When analyzing dependences and computing runtime checks, we need the
smallest upper bound on the number of iterations. In terms of memory
safety, it shouldn't matter if any uncomputable exits leave the loop,
as long as we prove that there are no dependences given the minimum of
the countable exits. The same should apply also for generating runtime
checks.

Note that this shifts the responsiblity of checking whether all exit
counts are computable or handling early-exits to the users of LAA.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 12 +--
 .../Vectorize/LoopVectorizationLegality.cpp   | 10 ++
 .../early-exit-runtime-checks.ll              | 39 +++++++-
 .../memcheck-wrapping-pointers.ll             | 14 +--
 .../Transforms/LoopDistribute/early-exit.ll   | 96 +++++++++++++++++++
 .../Transforms/LoopLoadElim/early-exit.ll     | 61 ++++++++++++
 6 files changed, 216 insertions(+), 16 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopDistribute/early-exit.ll
 create mode 100644 llvm/test/Transforms/LoopLoadElim/early-exit.ll

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index bd4c2a35ebf2c..03ceacdceee56 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -214,7 +214,7 @@ getStartAndEndForAccess(const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
-    const SCEV *Ex = PSE.getBackedgeTakenCount();
+    const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
 
     ScStart = AR->getStart();
     ScEnd = AR->evaluateAtIteration(Ex, *SE);
@@ -2060,9 +2060,9 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   // stride multiplied by the backedge taken count, the accesses are independet,
   // i.e. they are far enough appart that accesses won't access the same
   // location across all loop ierations.
-  if (HasSameSize &&
-      isSafeDependenceDistance(DL, SE, *(PSE.getBackedgeTakenCount()), *Dist,
-                               MaxStride, TypeByteSize))
+  if (HasSameSize && isSafeDependenceDistance(
+                         DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
+                         *Dist, MaxStride, TypeByteSize))
     return Dependence::NoDep;
 
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
@@ -2400,7 +2400,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
   }
 
   // ScalarEvolution needs to be able to find the exit count.
-  const SCEV *ExitCount = PSE->getBackedgeTakenCount();
+  const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
   if (isa<SCEVCouldNotCompute>(ExitCount)) {
     recordAnalysis("CantComputeNumberOfIterations")
         << "could not determine number of loop iterations";
@@ -3009,7 +3009,7 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
   // of various possible stride specializations, considering the alternatives
   // of using gather/scatters (if available).
 
-  const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
+  const SCEV *BETakenCount = PSE->getSymbolicMaxBackedgeTakenCount();
 
   // Match the types so we can compare the stride and the BETakenCount.
   // The Stride can be positive/negative, so we sign extend Stride;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 9de49d1bcfeac..0c18c4e146de1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1506,6 +1506,16 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
       return false;
   }
 
+  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
+    reportVectorizationFailure("could not determine number of loop iterations",
+                               "could not determine number of loop iterations",
+                               "CantComputeNumberOfIterations", ORE, TheLoop);
+    if (DoExtraAnalysis)
+      Result = false;
+    else
+      return false;
+  }
+
   LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
                     << (LAI->getRuntimePointerChecking()->Need
                             ? " (with a runtime bound check)"
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
index 0d85f11f06dce..a40aaa8ae99a0 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/early-exit-runtime-checks.ll
@@ -4,10 +4,21 @@
 define void @all_exits_dominate_latch_countable_exits_at_most_500_iterations(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_500_iterations'
 ; CHECK-NEXT:    loop.header:
-; CHECK-NEXT:      Report: could not determine number of loop iterations
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:          (Low: %B High: (2000 + %B))
+; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: %A High: (2000 + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -53,10 +64,21 @@ e.2:
 define i32 @all_exits_dominate_latch_countable_exits_at_most_1000_iterations(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'all_exits_dominate_latch_countable_exits_at_most_1000_iterations'
 ; CHECK-NEXT:    loop.header:
-; CHECK-NEXT:      Report: could not determine number of loop iterations
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:          (Low: %B High: (4004 + %B))
+; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:          (Low: %A High: (4004 + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -145,10 +167,21 @@ e.2:
 define i32 @b3_does_not_dominate_latch(ptr %A, ptr %B) {
 ; CHECK-LABEL: 'b3_does_not_dominate_latch'
 ; CHECK-NEXT:    loop.header:
-; CHECK-NEXT:      Report: could not determine number of loop iterations
+; CHECK-NEXT:      Memory dependences are safe with run-time checks
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
+; CHECK-NEXT:      Check 0:
+; CHECK-NEXT:        Comparing group ([[GRP5:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.B = getelementptr inbounds i32, ptr %B, i64 %iv
+; CHECK-NEXT:        Against group ([[GRP6:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.A = getelementptr inbounds i32, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP5]]:
+; CHECK-NEXT:          (Low: %B High: (4004 + %B))
+; CHECK-NEXT:            Member: {%B,+,4}<nuw><%loop.header>
+; CHECK-NEXT:        Group [[GRP6]]:
+; CHECK-NEXT:          (Low: %A High: (4004 + %A))
+; CHECK-NEXT:            Member: {%A,+,4}<nuw><%loop.header>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
index 6dbb4a0c0129a..4a1b3b2450e02 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -29,20 +29,20 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 ; CHECK-NEXT:    Run-time memory checks:
 ; CHECK-NEXT:    Check 0:
 ; CHECK-NEXT:      Comparing group
-; CHECK-NEXT:        %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
-; CHECK-NEXT:      Against group
 ; CHECK-NEXT:        %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11
+; CHECK-NEXT:      Against group
+; CHECK-NEXT:        %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
 ; CHECK-NEXT:    Grouped accesses:
 ; CHECK-NEXT:      Group
-; CHECK-NEXT:        (Low: (4 + %a) High: (4 + (4 * (1 umax %x)) + %a))
-; CHECK-NEXT:          Member: {(4 + %a),+,4}<%for.body>
-; CHECK-NEXT:      Group
 ; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
 ; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
+; CHECK-NEXT:      Group
+; CHECK-NEXT:        (Low: (4 + %a) High: (4 + (4 * (1 umax %x)) + %a))
+; CHECK-NEXT:          Member: {(4 + %a),+,4}<%for.body>
 ; CHECK:         Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:    SCEV assumptions:
-; CHECK-NEXT:    {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:    {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:    {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK:         Expressions re-written:
 ; CHECK-NEXT:    [PSE]  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom:
 ; CHECK-NEXT:      ((4 * (zext i32 {1,+,1}<%for.body> to i64))<nuw><nsw> + %a)<nuw>
@@ -84,8 +84,8 @@ exit:
 ; CHECK-LABEL: test2
 ; CHECK: Memory dependences are safe
 ; CHECK: SCEV assumptions:
-; CHECK-NEXT:   {1,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:   {0,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:   {1,+,1}<%for.body> Added Flags: <nusw>
  define void @test2(i64 %x, ptr %a) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopDistribute/early-exit.ll b/llvm/test/Transforms/LoopDistribute/early-exit.ll
new file mode 100644
index 0000000000000..e04811335e1bd
--- /dev/null
+++ b/llvm/test/Transforms/LoopDistribute/early-exit.ll
@@ -0,0 +1,96 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; REQUIRES: x86-registered-target
+; RUN: opt -aa-pipeline=basic-aa -passes=loop-distribute -enable-loop-distribute -verify-loop-info -verify-dom-info -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+ at B = common global ptr null, align 8
+ at A = common global ptr null, align 8
+ at C = common global ptr null, align 8
+ at D = common global ptr null, align 8
+ at E = common global ptr null, align 8
+
+define void @f() {
+; CHECK-LABEL: define void @f() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[A:%.*]] = load ptr, ptr @A, align 8
+; CHECK-NEXT:    [[B:%.*]] = load ptr, ptr @B, align 8
+; CHECK-NEXT:    [[C:%.*]] = load ptr, ptr @C, align 8
+; CHECK-NEXT:    [[D:%.*]] = load ptr, ptr @D, align 8
+; CHECK-NEXT:    [[E:%.*]] = load ptr, ptr @E, align 8
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[IND:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT:    [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADA:%.*]] = load i32, ptr [[ARRAYIDXA]], align 4
+; CHECK-NEXT:    [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADB:%.*]] = load i32, ptr [[ARRAYIDXB]], align 4
+; CHECK-NEXT:    [[UNCOUNTABLE_C:%.*]] = icmp eq i32 [[LOADB]], 10
+; CHECK-NEXT:    br i1 [[UNCOUNTABLE_C]], label %[[FOR_END:.*]], label %[[LATCH]]
+; CHECK:       [[LATCH]]:
+; CHECK-NEXT:    [[MULA:%.*]] = mul i32 [[LOADB]], [[LOADA]]
+; CHECK-NEXT:    [[ADD]] = add nuw nsw i64 [[IND]], 1
+; CHECK-NEXT:    [[ARRAYIDXA_PLUS_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[ADD]]
+; CHECK-NEXT:    store i32 [[MULA]], ptr [[ARRAYIDXA_PLUS_4]], align 4
+; CHECK-NEXT:    [[ARRAYIDXD:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADD:%.*]] = load i32, ptr [[ARRAYIDXD]], align 4
+; CHECK-NEXT:    [[ARRAYIDXE:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[IND]]
+; CHECK-NEXT:    [[LOADE:%.*]] = load i32, ptr [[ARRAYIDXE]], align 4
+; CHECK-NEXT:    [[MULC:%.*]] = mul i32 [[LOADD]], [[LOADE]]
+; CHECK-NEXT:    [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IND]]
+; CHECK-NEXT:    store i32 [[MULC]], ptr [[ARRAYIDXC]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[ADD]], 20
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = load ptr, ptr @A, align 8
+  %b = load ptr, ptr @B, align 8
+  %c = load ptr, ptr @C, align 8
+  %d = load ptr, ptr @D, align 8
+  %e = load ptr, ptr @E, align 8
+  br label %for.body
+
+for.body:
+  %ind = phi i64 [ 0, %entry ], [ %add, %latch ]
+
+  %arrayidxA = getelementptr inbounds i32, ptr %a, i64 %ind
+  %loadA = load i32, ptr %arrayidxA, align 4
+
+  %arrayidxB = getelementptr inbounds i32, ptr %b, i64 %ind
+  %loadB = load i32, ptr %arrayidxB, align 4
+  %uncountable.c = icmp eq i32 %loadB, 10
+  br i1 %uncountable.c, label %for.end, label %latch
+
+latch:
+  %mulA = mul i32 %loadB, %loadA
+
+  %add = add nuw nsw i64 %ind, 1
+  %arrayidxA_plus_4 = getelementptr inbounds i32, ptr %a, i64 %add
+  store i32 %mulA, ptr %arrayidxA_plus_4, align 4
+
+  %arrayidxD = getelementptr inbounds i32, ptr %d, i64 %ind
+  %loadD = load i32, ptr %arrayidxD, align 4
+
+  %arrayidxE = getelementptr inbounds i32, ptr %e, i64 %ind
+  %loadE = load i32, ptr %arrayidxE, align 4
+
+  %mulC = mul i32 %loadD, %loadE
+
+  %arrayidxC = getelementptr inbounds i32, ptr %c, i64 %ind
+  store i32 %mulC, ptr %arrayidxC, align 4
+
+  %exitcond = icmp eq i64 %add, 20
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind convergent }
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.distribute.enable", i1 true}
diff --git a/llvm/test/Transforms/LoopLoadElim/early-exit.ll b/llvm/test/Transforms/LoopLoadElim/early-exit.ll
new file mode 100644
index 0000000000000..92ff4d852a87d
--- /dev/null
+++ b/llvm/test/Transforms/LoopLoadElim/early-exit.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-load-elim -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @f(ptr %A, ptr %B, ptr %C, i64 %N) {
+; CHECK-LABEL: define void @f(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[FOR_BODY_LVER_CHECK:.*]]:
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_LVER_CHECK]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[AIDX_NEXT:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV_NEXT]]
+; CHECK-NEXT:    [[BIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[CIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[AIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[B:%.*]] = load i32, ptr [[BIDX]], align 4
+; CHECK-NEXT:    [[UNCOUNTABLE_C:%.*]] = icmp eq i32 [[B]], 10
+; CHECK-NEXT:    br i1 [[UNCOUNTABLE_C]], label %[[LATCH]], label %[[FOR_END:.*]]
+; CHECK:       [[LATCH]]:
+; CHECK-NEXT:    [[A_P1:%.*]] = add i32 [[B]], 2
+; CHECK-NEXT:    store i32 [[A_P1]], ptr [[AIDX_NEXT]], align 4
+; CHECK-NEXT:    [[A:%.*]] = load i32, ptr [[AIDX]], align 1
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[A]], 2
+; CHECK-NEXT:    store i32 [[C]], ptr [[CIDX]], align 4
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_END]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %latch ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+
+  %Aidx_next = getelementptr inbounds i32, ptr %A, i64 %indvars.iv.next
+  %Bidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
+  %Cidx = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
+  %Aidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
+
+  %b = load i32, ptr %Bidx, align 4
+  %uncountable.c = icmp eq i32 %b, 10
+  br i1 %uncountable.c, label  %latch, label %for.end
+
+latch:
+  %a_p1 = add i32 %b, 2
+  store i32 %a_p1, ptr %Aidx_next, align 4
+
+  %a = load i32, ptr %Aidx, align 1
+  %c = mul i32 %a, 2
+  store i32 %c, ptr %Cidx, align 4
+
+  %exitcond = icmp eq i64 %indvars.iv.next, %N
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}

>From 22afcf6437329d1c2f369e7310e1564451af03e0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 28 May 2024 17:21:21 -0700
Subject: [PATCH 2/3] !fixup address comments, thanks!

---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 53 ++++++++++---------
 .../memcheck-wrapping-pointers.ll             | 14 ++---
 .../X86/vectorization-remarks-missed.ll       |  2 +-
 3 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 03ceacdceee56..1fd2cf79b0c18 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1796,28 +1796,28 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
 /// Given a dependence-distance \p Dist between two
 /// memory accesses, that have strides in the same direction whose absolute
 /// value of the maximum stride is given in \p MaxStride, and that have the same
-/// type size \p TypeByteSize, in a loop whose takenCount is \p
-/// BackedgeTakenCount, check if it is possible to prove statically that the
-/// dependence distance is larger than the range that the accesses will travel
-/// through the execution of the loop. If so, return true; false otherwise. This
-/// is useful for example in loops such as the following (PR31098):
+/// type size \p TypeByteSize, in a loop whose maximum backedge taken count is
+/// \p MaxBTC, check if it is possible to prove statically that the dependence
+/// distance is larger than the range that the accesses will travel through the
+/// execution of the loop. If so, return true; false otherwise. This is useful
+/// for example in loops such as the following (PR31098):
 ///     for (i = 0; i < D; ++i) {
 ///                = out[i];
 ///       out[i+D] =
 ///     }
 static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
-                                     const SCEV &BackedgeTakenCount,
-                                     const SCEV &Dist, uint64_t MaxStride,
+                                     const SCEV &MaxBTC, const SCEV &Dist,
+                                     uint64_t MaxStride,
                                      uint64_t TypeByteSize) {
 
   // If we can prove that
-  //      (**) |Dist| > BackedgeTakenCount * Step
+  //      (**) |Dist| > MaxBTC * Step
   // where Step is the absolute stride of the memory accesses in bytes,
   // then there is no dependence.
   //
   // Rationale:
   // We basically want to check if the absolute distance (|Dist/Step|)
-  // is >= the loop iteration count (or > BackedgeTakenCount).
+  // is >= the loop iteration count (or > MaxBTC).
   // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
   // Section 4.2.1); Note, that for vectorization it is sufficient to prove
   // that the dependence distance is >= VF; This is checked elsewhere.
@@ -1828,8 +1828,8 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
   // also guarantees that distance >= VF.
   //
   const uint64_t ByteStride = MaxStride * TypeByteSize;
-  const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
-  const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
+  const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
+  const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
 
   const SCEV *CastedDist = &Dist;
   const SCEV *CastedProduct = Product;
@@ -1844,13 +1844,13 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
   else
     CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
 
-  // Is  Dist - (BackedgeTakenCount * Step) > 0 ?
+  // Is  Dist - (MaxBTC * Step) > 0 ?
   // (If so, then we have proven (**) because |Dist| >= Dist)
   const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
   if (SE.isKnownPositive(Minus))
     return true;
 
-  // Second try: Is  -Dist - (BackedgeTakenCount * Step) > 0 ?
+  // Second try: Is  -Dist - (MaxBTC * Step) > 0 ?
   // (If so, then we have proven (**) because |Dist| >= -1*Dist)
   const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
   Minus = SE.getMinusSCEV(NegDist, CastedProduct);
@@ -2057,9 +2057,10 @@ MemoryDepChecker::Dependence::DepType MemoryDepChecker::isDependent(
   uint64_t MaxStride = std::max(StrideA, StrideB);
 
   // If the distance between the acecsses is larger than their maximum absolute
-  // stride multiplied by the backedge taken count, the accesses are independet,
-  // i.e. they are far enough appart that accesses won't access the same
-  // location across all loop ierations.
+  // stride multiplied by the symbolic maximum backedge taken count (which is an
+  // upper bound of the number of iterations), the accesses are independet, i.e.
+  // they are far enough appart that accesses won't access the same location
+  // across all loop ierations.
   if (HasSameSize && isSafeDependenceDistance(
                          DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
                          *Dist, MaxStride, TypeByteSize))
@@ -2399,7 +2400,9 @@ bool LoopAccessInfo::canAnalyzeLoop() {
     return false;
   }
 
-  // ScalarEvolution needs to be able to find the exit count.
+  // ScalarEvolution needs to be able to find the symbolic max backedge taken
+  // count, which is an upper bound on the number of loop iterations. The loop
+  // may execute fewer iterations, if it exits via an uncountable exit.
   const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
   if (isa<SCEVCouldNotCompute>(ExitCount)) {
     recordAnalysis("CantComputeNumberOfIterations")
@@ -3009,25 +3012,25 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
   // of various possible stride specializations, considering the alternatives
   // of using gather/scatters (if available).
 
-  const SCEV *BETakenCount = PSE->getSymbolicMaxBackedgeTakenCount();
+  const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();
 
-  // Match the types so we can compare the stride and the BETakenCount.
+  // Match the types so we can compare the stride and the MaxBTC.
   // The Stride can be positive/negative, so we sign extend Stride;
-  // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
+  // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
   const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
   uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
-  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(BETakenCount->getType());
+  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(MaxBTC->getType());
   const SCEV *CastedStride = StrideExpr;
-  const SCEV *CastedBECount = BETakenCount;
+  const SCEV *CastedBECount = MaxBTC;
   ScalarEvolution *SE = PSE->getSE();
   if (BETypeSizeBits >= StrideTypeSizeBits)
-    CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
+    CastedStride = SE->getNoopOrSignExtend(StrideExpr, MaxBTC->getType());
   else
-    CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
+    CastedBECount = SE->getZeroExtendExpr(MaxBTC, StrideExpr->getType());
   const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
   // Since TripCount == BackEdgeTakenCount + 1, checking:
   // "Stride >= TripCount" is equivalent to checking:
-  // Stride - BETakenCount > 0
+  // Stride - MaxBTC> 0
   if (SE->isKnownPositive(StrideMinusBETaken)) {
     LLVM_DEBUG(
         dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
index 4a1b3b2450e02..6dbb4a0c0129a 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/memcheck-wrapping-pointers.ll
@@ -29,20 +29,20 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 ; CHECK-NEXT:    Run-time memory checks:
 ; CHECK-NEXT:    Check 0:
 ; CHECK-NEXT:      Comparing group
-; CHECK-NEXT:        %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11
-; CHECK-NEXT:      Against group
 ; CHECK-NEXT:        %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+; CHECK-NEXT:      Against group
+; CHECK-NEXT:        %arrayidx4 = getelementptr inbounds i32, ptr %b, i64 %conv11
 ; CHECK-NEXT:    Grouped accesses:
 ; CHECK-NEXT:      Group
-; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
-; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
-; CHECK-NEXT:      Group
 ; CHECK-NEXT:        (Low: (4 + %a) High: (4 + (4 * (1 umax %x)) + %a))
 ; CHECK-NEXT:          Member: {(4 + %a),+,4}<%for.body>
+; CHECK-NEXT:      Group
+; CHECK-NEXT:        (Low: %b High: ((4 * (1 umax %x)) + %b))
+; CHECK-NEXT:          Member: {%b,+,4}<%for.body>
 ; CHECK:         Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:    SCEV assumptions:
-; CHECK-NEXT:    {0,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:    {1,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:    {0,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK:         Expressions re-written:
 ; CHECK-NEXT:    [PSE]  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom:
 ; CHECK-NEXT:      ((4 * (zext i32 {1,+,1}<%for.body> to i64))<nuw><nsw> + %a)<nuw>
@@ -84,8 +84,8 @@ exit:
 ; CHECK-LABEL: test2
 ; CHECK: Memory dependences are safe
 ; CHECK: SCEV assumptions:
-; CHECK-NEXT:   {0,+,1}<%for.body> Added Flags: <nusw>
 ; CHECK-NEXT:   {1,+,1}<%for.body> Added Flags: <nusw>
+; CHECK-NEXT:   {0,+,1}<%for.body> Added Flags: <nusw>
  define void @test2(i64 %x, ptr %a) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index de2fe124ea63b..ac33f6e3e6f72 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -124,7 +124,7 @@
 ; YAML-NEXT:   - String:          'loop not vectorized: '
 ; YAML-NEXT:   - String:          could not determine number of loop iterations
 ; YAML-NEXT: ...
-; YAML-NEXT: --- !Missed
+; YAML:      --- !Missed
 ; YAML-NEXT: Pass:            loop-vectorize
 ; YAML-NEXT: Name:            MissedDetails
 ; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 27, Column: 3 }

>From 34d7eab208042167fcd05c2112a871c0b194b73e Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 4 Jun 2024 20:56:08 +0100
Subject: [PATCH 3/3] !fixup use predicated symbolic max BTC in
 generateOverflowCheck.

---
 llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 0feea0a4233cd..c437a44dda8d3 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2079,7 +2079,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
   // FIXME: It is highly suspicious that we're ignoring the predicates here.
   SmallVector<const SCEVPredicate *, 4> Pred;
   const SCEV *ExitCount =
-      SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
+      SE.getPredicatedSymbolicMaxBackedgeTakenCount(AR->getLoop(), Pred);
 
   assert(!isa<SCEVCouldNotCompute>(ExitCount) && "Invalid loop count");