[llvm] r313237 - [LV] Fix maximum legal VF calculation

Thu Sep 14 00:40:02 PDT 2017

Author: alonkom
Date: Thu Sep 14 00:40:02 2017
New Revision: 313237

URL: http://llvm.org/viewvc/llvm-project?rev=313237&view=rev
Log:
[LV] Fix maximum legal VF calculation

This patch fixes pr34283, which exposed that the computation of
maximum legal width for vectorization was wrong, because it relied
on MaxInterleaveFactor to obtain the maximum stride used in the loop,
however not all strided accesses in the loop have an interleave-group
associated with them.
Instead of recording the maximum stride in the loop, which can be over
conservative (e.g. if the access with the maximum stride is not involved
in the dependence limitation), this patch tracks the actual maximum legal
width imposed by accesses that are involved in dependencies.

Differential Revision: https://reviews.llvm.org/D37507

Modified:
    llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
    llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/memdep.ll

Modified: llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h?rev=313237&r1=313236&r2=313237&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/LoopAccessAnalysis.h Thu Sep 14 00:40:02 2017
@@ -163,7 +163,7 @@ public:
   };
 
   MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L)
-      : PSE(PSE), InnermostLoop(L), AccessIdx(0),
+      : PSE(PSE), InnermostLoop(L), AccessIdx(0), MaxSafeRegisterWidth(-1U),
         ShouldRetryWithRuntimeCheck(false), SafeForVectorization(true),
         RecordDependences(true) {}
 
@@ -199,6 +199,10 @@ public:
   /// the accesses safely with.
   uint64_t getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
 
+  /// \brief Return the number of elements that are safe to operate on
+  /// simultaneously, multiplied by the size of the element in bits.
+  uint64_t getMaxSafeRegisterWidth() const { return MaxSafeRegisterWidth; }
+
   /// \brief In same cases when the dependency check fails we can still
   /// vectorize the loop with a dynamic array access check.
   bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
@@ -255,6 +259,12 @@ private:
   // We can access this many bytes in parallel safely.
   uint64_t MaxSafeDepDistBytes;
 
+  /// \brief Number of elements (from consecutive iterations) that are safe to
+  /// operate on simultaneously, multiplied by the size of the element in bits.
+  /// The size of the element is taken from the memory access that is most
+  /// restrictive.
+  uint64_t MaxSafeRegisterWidth;
+
   /// \brief If we see a non-constant dependence distance we can still try to
   /// vectorize this loop with runtime checks.
   bool ShouldRetryWithRuntimeCheck;

Modified: llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp?rev=313237&r1=313236&r2=313237&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp (original)
+++ llvm/trunk/lib/Analysis/LoopAccessAnalysis.cpp Thu Sep 14 00:40:02 2017
@@ -1540,10 +1540,11 @@ MemoryDepChecker::isDependent(const MemA
       couldPreventStoreLoadForward(Distance, TypeByteSize))
     return Dependence::BackwardVectorizableButPreventsForwarding;
 
+  uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
   DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
-               << " with max VF = "
-               << MaxSafeDepDistBytes / (TypeByteSize * Stride) << '\n');
-
+               << " with max VF = " << MaxVF << '\n');
+  uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
+  MaxSafeRegisterWidth = std::min(MaxSafeRegisterWidth, MaxVFInBits);
   return Dependence::BackwardVectorizable;
 }
 

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=313237&r1=313236&r2=313237&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Thu Sep 14 00:40:02 2017
@@ -963,14 +963,6 @@ public:
     return InterleaveGroupMap.count(Instr);
   }
 
-  /// \brief Return the maximum interleave factor of all interleaved groups.
-  unsigned getMaxInterleaveFactor() const {
-    unsigned MaxFactor = 1;
-    for (auto &Entry : InterleaveGroupMap)
-      MaxFactor = std::max(MaxFactor, Entry.second->getFactor());
-    return MaxFactor;
-  }
-
   /// \brief Get the interleave group that \p Instr belongs to.
   ///
   /// \returns nullptr if doesn't have such group.
@@ -1553,11 +1545,6 @@ public:
     return InterleaveInfo.isInterleaved(Instr);
   }
 
-  /// \brief Return the maximum interleave factor of all interleaved groups.
-  unsigned getMaxInterleaveFactor() const {
-    return InterleaveInfo.getMaxInterleaveFactor();
-  }
-
   /// \brief Get the interleaved access group that \p Instr belongs to.
   const InterleaveGroup *getInterleavedAccessGroup(Instruction *Instr) {
     return InterleaveInfo.getInterleaveGroup(Instr);
@@ -1571,6 +1558,10 @@ public:
 
   unsigned getMaxSafeDepDistBytes() { return LAI->getMaxSafeDepDistBytes(); }
 
+  uint64_t getMaxSafeRegisterWidth() const {
+	  return LAI->getDepChecker().getMaxSafeRegisterWidth();
+  }
+
   bool hasStride(Value *V) { return LAI->hasStride(V); }
 
   /// Returns true if the target machine supports masked store operation
@@ -6077,9 +6068,11 @@ void InterleavedAccessInfo::analyzeInter
 
   // Remove interleaved store groups with gaps.
   for (InterleaveGroup *Group : StoreGroups)
-    if (Group->getNumMembers() != Group->getFactor())
+    if (Group->getNumMembers() != Group->getFactor()) {
+      DEBUG(dbgs() << "LV: Invalidate candidate interleaved store group due "
+                      "to gaps.\n");
       releaseGroup(Group);
-
+    }
   // Remove interleaved groups with gaps (currently only loads) whose memory
   // accesses may wrap around. We have to revisit the getPtrStride analysis,
   // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
@@ -6132,6 +6125,8 @@ void InterleavedAccessInfo::analyzeInter
       // to look for a member at index factor - 1, since every group must have
       // a member at index zero.
       if (Group->isReverse()) {
+        DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
+                        "a reverse access with gaps.\n");
         releaseGroup(Group);
         continue;
       }
@@ -6215,25 +6210,20 @@ LoopVectorizationCostModel::computeFeasi
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-  unsigned MaxSafeDepDist = -1U;
 
-  // Get the maximum safe dependence distance in bits computed by LAA. If the
-  // loop contains any interleaved accesses, we divide the dependence distance
-  // by the maximum interleave factor of all interleaved groups. Note that
-  // although the division ensures correctness, this is a fairly conservative
-  // computation because the maximum distance computed by LAA may not involve
-  // any of the interleaved accesses.
-  if (Legal->getMaxSafeDepDistBytes() != -1U)
-    MaxSafeDepDist =
-        Legal->getMaxSafeDepDistBytes() * 8 / Legal->getMaxInterleaveFactor();
+  // Get the maximum safe dependence distance in bits computed by LAA.
+  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+  // the memory accesses that is most restrictive (involved in the smallest
+  // dependence distance).
+  unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
+
+  WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
 
-  WidestRegister =
-      ((WidestRegister < MaxSafeDepDist) ? WidestRegister : MaxSafeDepDist);
   unsigned MaxVectorSize = WidestRegister / WidestType;
 
   DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType << " / "
                << WidestType << " bits.\n");
-  DEBUG(dbgs() << "LV: The Widest register is: " << WidestRegister
+  DEBUG(dbgs() << "LV: The Widest register safe to use is: " << WidestRegister
                << " bits.\n");
 
   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"

Modified: llvm/trunk/test/Transforms/LoopVectorize/memdep.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/memdep.ll?rev=313237&r1=313236&r2=313237&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/memdep.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/memdep.ll Thu Sep 14 00:40:02 2017
@@ -1,5 +1,7 @@
 ; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s
 ; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s -check-prefix=WIDTH
+; RUN: opt -S -loop-vectorize -force-vector-width=4 < %s | FileCheck %s -check-prefix=RIGHTVF
+; RUN: opt -S -loop-vectorize -force-vector-width=8 < %s | FileCheck %s -check-prefix=WRONGVF
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -220,3 +222,52 @@ for.body:
 for.end:
   ret void
 }
+
+
+;Check the new calculation of the maximum safe distance in bits which can be vectorized.
+;The previous behavior did not take account that the stride was 2.
+;Therefore the maxVF was computed as 8 instead of 4, as the dependence distance here is 6 iterations, given by |N-(N-12)|/2.  
+
+;#define M 32
+;#define N 2 * M
+;unsigned int a [N];
+;void pr34283(){
+;	unsigned int j=0;
+;   for (j = 0; j < M - 6; ++j)
+;    {
+;        a[N - 2 * j] = 69;
+;        a[N - 12 - 2 * j] = 7;
+;    }
+;
+;}
+
+; RIGHTVF-LABEL: @pr34283
+; RIGHTVF: <4 x i64>
+
+; WRONGVF-LABLE: @pr34283
+; WRONGVF-NOT: <8 x i64>
+
+ at a = common local_unnamed_addr global [64 x i32] zeroinitializer, align 16
+
+; Function Attrs: norecurse nounwind uwtable
+define void @pr34283() local_unnamed_addr {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl i64 %indvars.iv, 1
+  %1 = sub nuw nsw i64 64, %0
+  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %1
+  store i32 69, i32* %arrayidx, align 8
+  %2 = sub nuw nsw i64 52, %0
+  %arrayidx4 = getelementptr inbounds [64 x i32], [64 x i32]* @a, i64 0, i64 %2
+  store i32 7, i32* %arrayidx4, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 26
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+