[llvm] LAA: generalize strides over unequal type sizes (PR #108088)

Ramkumar Ramachandra via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 28 09:11:29 PST 2024


https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/108088

>From 91387edad50043885db3f87dadedcdc080e23a42 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Tue, 10 Sep 2024 11:54:02 +0100
Subject: [PATCH] LAA: generalize strides over unequal type sizes

getDependenceDistanceStrideAndSize currently returns a non-zero
TypeByteSize only if the type-sizes of the source and sink are equal.
The non-zero TypeByteSize is then used by isDependent to scale the
strides, the distance between the accesses, and the VF. This restriction
is very artificial, as the stride sizes can be scaled by the respective
type-sizes in advance, freeing isDependent of this responsibility, and
removing the ugly special-case of zero-TypeByteSize. The patch also has
the side-effect of fixing the long-standing TODO of requesting
runtime-checks when the strides are unequal.

The test impact of this patch is that several false dependencies are
eliminated, and several unknown dependencies now come with
runtime-checks instead.
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  16 +-
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 144 ++++++++++--------
 .../LoopAccessAnalysis/depend_diff_types.ll   |  10 +-
 .../forward-loop-carried.ll                   |   4 -
 ...endence-distance-different-access-sizes.ll |  45 ++----
 ...interleave-allocsize-not-equal-typesize.ll |  31 +++-
 6 files changed, 135 insertions(+), 115 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index a35bc7402d1a89..7bc8c4deae1a36 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -366,16 +366,20 @@ class MemoryDepChecker {
 
   struct DepDistanceStrideAndSizeInfo {
     const SCEV *Dist;
-    uint64_t StrideA;
-    uint64_t StrideB;
+    uint64_t MaxStride;
+    std::optional<uint64_t> CommonStride;
+    bool ShouldRetryWithRuntimeCheck;
     uint64_t TypeByteSize;
     bool AIsWrite;
     bool BIsWrite;
 
-    DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t StrideA,
-                                 uint64_t StrideB, uint64_t TypeByteSize,
-                                 bool AIsWrite, bool BIsWrite)
-        : Dist(Dist), StrideA(StrideA), StrideB(StrideB),
+    DepDistanceStrideAndSizeInfo(const SCEV *Dist, uint64_t MaxStride,
+                                 std::optional<uint64_t> CommonStride,
+                                 bool ShouldRetryWithRuntimeCheck,
+                                 uint64_t TypeByteSize, bool AIsWrite,
+                                 bool BIsWrite)
+        : Dist(Dist), MaxStride(MaxStride), CommonStride(CommonStride),
+          ShouldRetryWithRuntimeCheck(ShouldRetryWithRuntimeCheck),
           TypeByteSize(TypeByteSize), AIsWrite(AIsWrite), BIsWrite(BIsWrite) {}
   };
 
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 71582d5d86549b..6e3e1e8900c704 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1799,8 +1799,7 @@ void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
 ///     }
 static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
                                      const SCEV &MaxBTC, const SCEV &Dist,
-                                     uint64_t MaxStride,
-                                     uint64_t TypeByteSize) {
+                                     uint64_t MaxStride) {
 
   // If we can prove that
   //      (**) |Dist| > MaxBTC * Step
@@ -1819,8 +1818,7 @@ static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
   // will be executed only if LoopCount >= VF, proving distance >= LoopCount
   // also guarantees that distance >= VF.
   //
-  const uint64_t ByteStride = MaxStride * TypeByteSize;
-  const SCEV *Step = SE.getConstant(MaxBTC.getType(), ByteStride);
+  const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
   const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);
 
   const SCEV *CastedDist = &Dist;
@@ -1864,9 +1862,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
   if (Distance % TypeByteSize)
     return false;
 
-  uint64_t ScaledDist = Distance / TypeByteSize;
-
-  // No dependence if the scaled distance is not multiple of the stride.
+  // No dependence if the distance is not multiple of the stride.
   // E.g.
   //      for (i = 0; i < 1024 ; i += 4)
   //        A[i+2] = A[i] + 1;
@@ -1882,7 +1878,7 @@ static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
   // Two accesses in memory (scaled distance is 4, stride is 3):
   //     | A[0] |      |      | A[3] |      |      | A[6] |      |      |
   //     |      |      |      |      | A[4] |      |      | A[7] |      |
-  return ScaledDist % Stride;
+  return Distance % Stride;
 }
 
 std::variant<MemoryDepChecker::Dependence::DepType,
@@ -1921,6 +1917,7 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   if (StrideAPtr && *StrideAPtr < 0) {
     std::swap(Src, Sink);
     std::swap(AInst, BInst);
+    std::swap(ATy, BTy);
     std::swap(StrideAPtr, StrideBPtr);
   }
 
@@ -1972,30 +1969,68 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
     return MemoryDepChecker::Dependence::IndirectUnsafe;
   }
 
-  int64_t StrideAPtrInt = *StrideAPtr;
-  int64_t StrideBPtrInt = *StrideBPtr;
-  LLVM_DEBUG(dbgs() << "LAA:  Src induction step: " << StrideAPtrInt
-                    << " Sink induction step: " << StrideBPtrInt << "\n");
+  LLVM_DEBUG(dbgs() << "LAA:  Src induction step: " << *StrideAPtr
+                    << " Sink induction step: " << *StrideBPtr << "\n");
+
+  // Note that store size is different from alloc size, which is dependent on
+  // store size. We use the former for checking illegal cases, and the latter
+  // for scaling strides.
+  TypeSize AStoreSz = DL.getTypeStoreSize(ATy),
+           BStoreSz = DL.getTypeStoreSize(BTy);
+
+  // When the distance is zero, we're reading/writing the same memory location:
+  // check that the store sizes are equal. Otherwise, fail with an unknown
+  // dependence for which we should not generate runtime checks.
+  if (Dist->isZero() && AStoreSz != BStoreSz)
+    return MemoryDepChecker::Dependence::Unknown;
+
+  // We can't get a uint64_t for the AllocSize if either of the store sizes
+  // is scalable.
+  if (AStoreSz.isScalable() || BStoreSz.isScalable())
+    return MemoryDepChecker::Dependence::Unknown;
+
+  // The TypeByteSize is used to scale Distance and VF. In these contexts, the
+  // only size that matters is the size of the Sink.
+  uint64_t ASz = alignTo(AStoreSz, DL.getABITypeAlign(ATy).value()),
+           TypeByteSize = alignTo(BStoreSz, DL.getABITypeAlign(BTy).value());
+
+  // We scale the strides by the alloc-type-sizes, so we can check that the
+  // common distance is equal when ASz != BSz.
+  int64_t StrideAScaled = *StrideAPtr * ASz;
+  int64_t StrideBScaled = *StrideBPtr * TypeByteSize;
+
   // At least Src or Sink are loop invariant and the other is strided or
   // invariant. We can generate a runtime check to disambiguate the accesses.
-  if (!StrideAPtrInt || !StrideBPtrInt)
+  if (!StrideAScaled || !StrideBScaled)
     return MemoryDepChecker::Dependence::Unknown;
 
   // Both Src and Sink have a constant stride, check if they are in the same
   // direction.
-  if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
+  if ((StrideAScaled > 0) != (StrideBScaled > 0)) {
     LLVM_DEBUG(
         dbgs() << "Pointer access with strides in different directions\n");
     return MemoryDepChecker::Dependence::Unknown;
   }
 
-  uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
-  bool HasSameSize =
-      DL.getTypeStoreSizeInBits(ATy) == DL.getTypeStoreSizeInBits(BTy);
-  if (!HasSameSize)
-    TypeByteSize = 0;
-  return DepDistanceStrideAndSizeInfo(Dist, std::abs(StrideAPtrInt),
-                                      std::abs(StrideBPtrInt), TypeByteSize,
+  StrideAScaled = std::abs(StrideAScaled);
+  StrideBScaled = std::abs(StrideBScaled);
+
+  // MaxStride is the max of the scaled strides, as expected.
+  uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);
+
+  // CommonStride is set if both scaled strides are equal.
+  std::optional<uint64_t> CommonStride;
+  if (StrideAScaled == StrideBScaled)
+    CommonStride = StrideAScaled;
+
+  // TODO: Historically, we don't retry with runtime checks unless the unscaled
+  // strides are the same, but this doesn't make sense. Fix this once the
+  // condition for runtime checks in isDependent is fixed.
+  bool ShouldRetryWithRuntimeCheck =
+      std::abs(*StrideAPtr) == std::abs(*StrideBPtr);
+
+  return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
+                                      ShouldRetryWithRuntimeCheck, TypeByteSize,
                                       AIsWrite, BIsWrite);
 }
 
@@ -2011,32 +2046,28 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   if (std::holds_alternative<Dependence::DepType>(Res))
     return std::get<Dependence::DepType>(Res);
 
-  auto &[Dist, StrideA, StrideB, TypeByteSize, AIsWrite, BIsWrite] =
+  auto &[Dist, MaxStride, CommonStride, ShouldRetryWithRuntimeCheck,
+         TypeByteSize, AIsWrite, BIsWrite] =
       std::get<DepDistanceStrideAndSizeInfo>(Res);
-  bool HasSameSize = TypeByteSize > 0;
 
-  std::optional<uint64_t> CommonStride =
-      StrideA == StrideB ? std::make_optional(StrideA) : std::nullopt;
   if (isa<SCEVCouldNotCompute>(Dist)) {
-    // TODO: Relax requirement that there is a common stride to retry with
-    // non-constant distance dependencies.
-    FoundNonConstantDistanceDependence |= CommonStride.has_value();
+    // TODO: Relax requirement that there is a common unscaled stride to retry
+    // with non-constant distance dependencies.
+    FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
     LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
     return Dependence::Unknown;
   }
 
   ScalarEvolution &SE = *PSE.getSE();
   auto &DL = InnermostLoop->getHeader()->getDataLayout();
-  uint64_t MaxStride = std::max(StrideA, StrideB);
 
   // If the distance between the acecsses is larger than their maximum absolute
   // stride multiplied by the symbolic maximum backedge taken count (which is an
   // upper bound of the number of iterations), the accesses are independet, i.e.
   // they are far enough appart that accesses won't access the same location
   // across all loop ierations.
-  if (HasSameSize && isSafeDependenceDistance(
-                         DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()),
-                         *Dist, MaxStride, TypeByteSize))
+  if (isSafeDependenceDistance(
+          DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
     return Dependence::NoDep;
 
   const SCEVConstant *ConstDist = dyn_cast<SCEVConstant>(Dist);
@@ -2047,7 +2078,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
 
     // If the distance between accesses and their strides are known constants,
     // check whether the accesses interlace each other.
-    if (Distance > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
+    if (Distance > 0 && CommonStride && CommonStride > 1 &&
         areStridedAccessesIndependent(Distance, *CommonStride, TypeByteSize)) {
       LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
       return Dependence::NoDep;
@@ -2061,15 +2092,9 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
 
   // Negative distances are not plausible dependencies.
   if (SE.isKnownNonPositive(Dist)) {
-    if (SE.isKnownNonNegative(Dist)) {
-      if (HasSameSize) {
-        // Write to the same location with the same size.
-        return Dependence::Forward;
-      }
-      LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
-                           "different type sizes\n");
-      return Dependence::Unknown;
-    }
+    if (SE.isKnownNonNegative(Dist))
+      // Write to the same location.
+      return Dependence::Forward;
 
     bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
     // Check if the first access writes to a location that is read in a later
@@ -2084,13 +2109,12 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
       if (!ConstDist) {
         // TODO: FoundNonConstantDistanceDependence is used as a necessary
         // condition to consider retrying with runtime checks. Historically, we
-        // did not set it when strides were different but there is no inherent
-        // reason to.
-        FoundNonConstantDistanceDependence |= CommonStride.has_value();
+        // did not set it when unscaled strides were different but there is no
+        // inherent reason to.
+        FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
         return Dependence::Unknown;
       }
-      if (!HasSameSize ||
-          couldPreventStoreLoadForward(
+      if (couldPreventStoreLoadForward(
               ConstDist->getAPInt().abs().getZExtValue(), TypeByteSize)) {
         LLVM_DEBUG(
             dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
@@ -2105,27 +2129,20 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
   // Below we only handle strictly positive distances.
   if (MinDistance <= 0) {
-    FoundNonConstantDistanceDependence |= CommonStride.has_value();
+    FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
     return Dependence::Unknown;
   }
 
-  if (!ConstDist) {
+  if (!ConstDist)
     // Previously this case would be treated as Unknown, possibly setting
     // FoundNonConstantDistanceDependence to force re-trying with runtime
     // checks. Until the TODO below is addressed, set it here to preserve
     // original behavior w.r.t. re-trying with runtime checks.
     // TODO: FoundNonConstantDistanceDependence is used as a necessary
     // condition to consider retrying with runtime checks. Historically, we
-    // did not set it when strides were different but there is no inherent
-    // reason to.
-    FoundNonConstantDistanceDependence |= CommonStride.has_value();
-  }
-
-  if (!HasSameSize) {
-    LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
-                         "different type sizes\n");
-    return Dependence::Unknown;
-  }
+    // did not set it when unscaled strides were different but there is no
+    // inherent reason to.
+    FoundNonConstantDistanceDependence |= ShouldRetryWithRuntimeCheck;
 
   if (!CommonStride)
     return Dependence::Unknown;
@@ -2140,8 +2157,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
 
   // It's not vectorizable if the distance is smaller than the minimum distance
   // needed for a vectroized/unrolled version. Vectorizing one iteration in
-  // front needs TypeByteSize * Stride. Vectorizing the last iteration needs
-  // TypeByteSize (No need to plus the last gap distance).
+  // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize
+  // (No need to plus the last gap distance).
   //
   // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
   //      foo(int *A) {
@@ -2168,8 +2185,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
   // We know that Dist is positive, but it may not be constant. Use the signed
   // minimum for computations below, as this ensures we compute the closest
   // possible dependence distance.
-  uint64_t MinDistanceNeeded =
-      TypeByteSize * *CommonStride * (MinNumIter - 1) + TypeByteSize;
+  uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize;
   if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
     if (!ConstDist) {
       // For non-constant distances, we checked the lower bound of the
@@ -2225,7 +2241,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
 
   // An update to MinDepDistBytes requires an update to MaxSafeVectorWidthInBits
   // since there is a backwards dependency.
-  uint64_t MaxVF = MinDepDistBytes / (TypeByteSize * *CommonStride);
+  uint64_t MaxVF = MinDepDistBytes / *CommonStride;
   LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
                     << " with max VF = " << MaxVF << '\n');
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 0bdcc357901487..cf1098f33fd653 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -129,16 +129,8 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
 ; CHECK-LABEL: 'neg_dist_dep_type_size_equivalence'
 ; CHECK-NEXT:    loop:
 ; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT:  Unknown data dependence.
+; CHECK-NEXT:  Backward loop carried data dependence that prevents store-to-load forwarding.
 ; CHECK-NEXT:      Dependences:
-; CHECK-NEXT:        Unknown:
-; CHECK-NEXT:            %ld.f64 = load double, ptr %gep.iv, align 8 ->
-; CHECK-NEXT:            store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
-; CHECK-EMPTY:
-; CHECK-NEXT:        Unknown:
-; CHECK-NEXT:            %ld.i64 = load i64, ptr %gep.iv, align 8 ->
-; CHECK-NEXT:            store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
-; CHECK-EMPTY:
 ; CHECK-NEXT:        BackwardVectorizableButPreventsForwarding:
 ; CHECK-NEXT:            %ld.f64 = load double, ptr %gep.iv, align 8 ->
 ; CHECK-NEXT:            store double %val, ptr %gep.iv.101.i64, align 8
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
index adfd19923e921c..7837c20f003e24 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-carried.ll
@@ -70,10 +70,6 @@ define void @forward_different_access_sizes(ptr readnone %end, ptr %start) {
 ; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
 ; CHECK-NEXT:            %l = load i24, ptr %gep.1, align 1
 ; CHECK-EMPTY:
-; CHECK-NEXT:        Forward:
-; CHECK-NEXT:            store i32 0, ptr %gep.2, align 4 ->
-; CHECK-NEXT:            store i24 %l, ptr %ptr.iv, align 1
-; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll
index 08e0bae7f05bac..ffd4619fd88dc9 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/positive-dependence-distance-different-access-sizes.ll
@@ -3,26 +3,13 @@
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 
-; TODO: No runtime checks should be needed, as the distance between accesses
-; is large enough to need runtime checks.
 define void @test_distance_positive_independent_via_trip_count(ptr %A) {
 ; CHECK-LABEL: 'test_distance_positive_independent_via_trip_count'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe with run-time checks
+; CHECK-NEXT:      Memory dependences are safe
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
-; CHECK-NEXT:      Check 0:
-; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.400, i64 %iv
-; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
-; CHECK-NEXT:        Group [[GRP1]]:
-; CHECK-NEXT:          (Low: (400 + %A)<nuw> High: (804 + %A))
-; CHECK-NEXT:            Member: {(400 + %A)<nuw>,+,4}<nuw><%loop>
-; CHECK-NEXT:        Group [[GRP2]]:
-; CHECK-NEXT:          (Low: %A High: (101 + %A))
-; CHECK-NEXT:            Member: {%A,+,1}<nuw><%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -41,7 +28,7 @@ loop:
   %ext = zext i8 %l to i32
   store i32 %ext, ptr %gep.A.400, align 4
   %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv, 100
+  %ec = icmp eq i64 %iv.next, 100
   br i1 %ec, label %exit, label %loop
 
 exit:
@@ -57,16 +44,16 @@ define void @test_distance_positive_backwards(ptr %A) {
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Check 0:
-; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.1, i64 %iv
-; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
-; CHECK-NEXT:        Group [[GRP3]]:
-; CHECK-NEXT:          (Low: (1 + %A)<nuw> High: (405 + %A))
+; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:          (Low: (1 + %A)<nuw> High: (401 + %A))
 ; CHECK-NEXT:            Member: {(1 + %A)<nuw>,+,4}<nuw><%loop>
-; CHECK-NEXT:        Group [[GRP4]]:
-; CHECK-NEXT:          (Low: %A High: (101 + %A))
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: %A High: (100 + %A))
 ; CHECK-NEXT:            Member: {%A,+,1}<nuw><%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
@@ -86,7 +73,7 @@ loop:
   %ext = zext i8 %l to i32
   store i32 %ext, ptr %gep.A.400, align 4
   %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv, 100
+  %ec = icmp eq i64 %iv.next, 100
   br i1 %ec, label %exit, label %loop
 
 exit:
@@ -100,16 +87,16 @@ define void @test_distance_positive_via_assume(ptr %A, i64 %off) {
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Check 0:
-; CHECK-NEXT:        Comparing group ([[GRP5:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.A.400 = getelementptr inbounds i32, ptr %A.off, i64 %iv
-; CHECK-NEXT:        Against group ([[GRP6:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.A = getelementptr inbounds i8, ptr %A, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
-; CHECK-NEXT:        Group [[GRP5]]:
-; CHECK-NEXT:          (Low: (%off + %A) High: (404 + %off + %A))
+; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:          (Low: (%off + %A) High: (400 + %off + %A))
 ; CHECK-NEXT:            Member: {(%off + %A),+,4}<nw><%loop>
-; CHECK-NEXT:        Group [[GRP6]]:
-; CHECK-NEXT:          (Low: %A High: (101 + %A))
+; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:          (Low: %A High: (100 + %A))
 ; CHECK-NEXT:            Member: {%A,+,1}<nuw><%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
@@ -131,7 +118,7 @@ loop:
   %ext = zext i8 %l to i32
   store i32 %ext, ptr %gep.A.400, align 4
   %iv.next = add nuw nsw i64 %iv, 1
-  %ec = icmp eq i64 %iv, 100
+  %ec = icmp eq i64 %iv.next, 100
   br i1 %ec, label %exit, label %loop
 
 exit:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
index 79d7ab84b3a0f0..e3dd48114e3f8d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-allocsize-not-equal-typesize.ll
@@ -96,17 +96,42 @@ exit:
 define void @pr58722_store_interleave_group(ptr %src, ptr %dst) {
 ; CHECK-LABEL: @pr58722_store_interleave_group(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[TMP1]]
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP0]] to i24
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i32 [[TMP1]] to i24
+; CHECK-NEXT:    store i24 [[TMP6]], ptr [[TMP4]], align 4
+; CHECK-NEXT:    store i24 [[TMP7]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 5000
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 10000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_IV:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i32 [[IV]]
 ; CHECK-NEXT:    store i32 [[IV]], ptr [[GEP_IV]], align 4
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i64, ptr [[GEP_IV]], i64 1
 ; CHECK-NEXT:    [[TRUNC_IV:%.*]] = trunc i32 [[IV]] to i24
 ; CHECK-NEXT:    store i24 [[TRUNC_IV]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 2
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV]], 10000
-; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;



More information about the llvm-commits mailing list