[llvm] 4ee45ab - [LV] Invalidate cost model decisions along with interleave groups.

Sat Apr 18 02:24:29 PDT 2020

Author: Florian Hahn
Date: 2020-04-18T10:23:49+01:00
New Revision: 4ee45ab60f8639375296f8b7b96e2eb5e8a2c9d3

URL: https://github.com/llvm/llvm-project/commit/4ee45ab60f8639375296f8b7b96e2eb5e8a2c9d3
DIFF: https://github.com/llvm/llvm-project/commit/4ee45ab60f8639375296f8b7b96e2eb5e8a2c9d3.diff

LOG: [LV] Invalidate cost model decisions along with interleave groups.

Cost-modeling decisions are tied to the computed interleave groups
(widening decisions, scalar and uniform values). When invalidating the
interleave groups, those decisions also need to be invalidated.

Otherwise there is a mismatch during VPlan construction.
VPWidenMemoryRecipes created initially are left around w/o converting them
into VPInterleave recipes. Such a conversion indeed should not take place,
and these gather/scatter recipes may in fact be right. The crux is leaving around
obsolete CM_Interleave (and dependent) markings of instructions along with
their costs, instead of recalculating decisions, costs, and recipes.

As an alternative to forcing a complete recompute later on, we could try
to selectively invalidate the decisions connected to the interleave
groups. But we would likely need to run the uniform/scalar value
detection parts again anyway, and the extra complexity is probably not
worth it.

Fixes PR45572.

Reviewers: gilr, rengolin, Ayal, hsaito

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D78298

Added: 
    llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll

Modified: 
    llvm/include/llvm/Analysis/VectorUtils.h
    llvm/lib/Analysis/VectorUtils.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
index 2b680e8131c4..89a2ea292209 100644

--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -698,7 +698,7 @@ class InterleavedAccessInfo {
                         const LoopAccessInfo *LAI)
       : PSE(PSE), TheLoop(L), DT(DT), LI(LI), LAI(LAI) {}
 
-  ~InterleavedAccessInfo() { reset(); }
+  ~InterleavedAccessInfo() { invalidateGroups(); }
 
   /// Analyze the interleaved accesses and collect them in interleave
   /// groups. Substitute symbolic strides using \p Strides.
@@ -709,16 +709,24 @@ class InterleavedAccessInfo {
   /// Invalidate groups, e.g., in case all blocks in loop will be predicated
   /// contrary to original assumption. Although we currently prevent group
   /// formation for predicated accesses, we may be able to relax this limitation
-  /// in the future once we handle more complicated blocks.
-  void reset() {
+  /// in the future once we handle more complicated blocks. Returns true if any
+  /// groups were invalidated.
+  bool invalidateGroups() {
+    if (InterleaveGroups.empty()) {
+      assert(
+          !RequiresScalarEpilogue &&
+          "RequiresScalarEpilog should not be set without interleave groups");
+      return false;
+    }
+
     InterleaveGroupMap.clear();
     for (auto *Ptr : InterleaveGroups)
       delete Ptr;
     InterleaveGroups.clear();
     RequiresScalarEpilogue = false;
+    return true;
   }
 
-
   /// Check if \p Instr belongs to any interleave group.
   bool isInterleaved(Instruction *Instr) const {
     return InterleaveGroupMap.find(Instr) != InterleaveGroupMap.end();

diff  --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 0d411485ddd9..7cbcb17a9a3d 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1243,6 +1243,8 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
     if (Group->requiresScalarEpilogue())
       DelSet.insert(Group);
   }
+  assert(!DelSet.empty() && "At least one group must be invalidated, as a "
+                            "scalar epilogue was required");
   for (auto *Ptr : DelSet) {
     LLVM_DEBUG(
         dbgs()

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7eaab8b0b739..27e3c93f7af8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1309,6 +1309,13 @@ class LoopVectorizationCostModel {
   /// i.e. either vector version isn't available, or is too expensive.
   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
 
+  /// Invalidates decisions already taken by the cost model.
+  void invalidateCostModelingDecisions() {
+    WideningDecisions.clear();
+    Uniforms.clear();
+    Scalars.clear();
+  }
+
 private:
   unsigned NumPredStores = 0;
 
@@ -4977,8 +4984,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
 
   // Invalidate interleave groups that require an epilogue if we can't mask
   // the interleave-group.
-  if (!useMaskedInterleavedAccesses(TTI))
+  if (!useMaskedInterleavedAccesses(TTI)) {
+    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+           "No decisions should have been taken at this point");
+    // Note: There is no need to invalidate any cost modeling decisions here, as
+    // none were taken so far.
     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+  }
 
   unsigned MaxVF = computeFeasibleMaxVF(TC);
   if (TC > 0 && TC % MaxVF == 0) {
@@ -6517,7 +6529,11 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
         dbgs()
         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
            "which requires masked-interleaved support.\n");
-    CM.InterleaveInfo.reset();
+    if (CM.InterleaveInfo.invalidateGroups())
+      // Invalidating interleave groups also requires invalidating all decisions
+      // based on them, which includes widening decisions and uniform and scalar
+      // values.
+      CM.invalidateCostModelingDecisions();
   }
 
   if (UserVF) {

diff  --git a/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
new file mode 100644
index 000000000000..5ce7eab25156
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/Hexagon/invalidate-cm-after-invalidating-interleavegroups.ll
@@ -0,0 +1,96 @@
+; RUN: opt -loop-vectorize -hexagon-autohvx=1 -force-vector-width=64 -prefer-predicate-over-epilog -S %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Test for PR45572.
+
+; Check that interleave groups and decisions based on them are correctly
+; invalidated with tail-folding on platforms where masked interleaved accesses
+; are disabled.
+
+; Make sure a vector body has been created, 64 element vectors are used and a block predicate has been computed.
+; Also make sure the loads are not widened.
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: %induction = add <64 x i32>
+; CHECK: icmp ule <64 x i32> %induction
+; CHECK-NOT: load <{{.*}} x i32>
+
+
+define void @test1(i32* %arg, i32 %N) #0 {
+entry:
+  %tmp = alloca i8
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i32 [ %iv.next, %loop], [ 0, %entry ]
+  %idx.mul = mul nuw nsw i32 %iv, 7
+  %idx.start = add nuw nsw i32 %idx.mul, 1
+  %tmp6 = getelementptr inbounds i32, i32* %arg, i32 %idx.start
+  %tmp7 = load i32, i32* %tmp6, align 4
+  %tmp8 = add nuw nsw i32 %idx.start, 1
+  %tmp9 = getelementptr inbounds i32, i32* %arg, i32 %tmp8
+  %tmp10 = load i32, i32* %tmp9, align 4
+  %tmp11 = add nuw nsw i32 %idx.start, 2
+  %tmp12 = getelementptr inbounds i32, i32* %arg, i32 %tmp11
+  %tmp13 = load i32, i32* %tmp12, align 4
+  %tmp14 = add nuw nsw i32 %idx.start, 3
+  %tmp15 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
+  %tmp16 = load i32, i32* %tmp15, align 4
+  %tmp18 = add nuw nsw i32 %idx.start, 4
+  %tmp19 = getelementptr inbounds i32, i32* %arg, i32 %tmp18
+  %tmp20 = load i32, i32* %tmp19, align 4
+  %tmp21 = add nuw nsw i32 %idx.start, 5
+  %tmp22 = getelementptr inbounds i32, i32* %arg, i32 %tmp21
+  %tmp23 = load i32, i32* %tmp22, align 4
+  %tmp25 = add nuw nsw i32 %idx.start, 6
+  %tmp26 = getelementptr inbounds i32, i32* %arg, i32 %tmp25
+  %tmp27 = load i32, i32* %tmp26, align 4
+  store i8 0, i8* %tmp, align 1
+  %iv.next= add nuw nsw i32 %iv, 1
+  %exit.cond = icmp eq i32 %iv.next, %N
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+; The loop below only requires tail folding due to interleave groups with gaps.
+; Make sure the loads are not widened.
+
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK-NOT: load <{{.*}} x i32>
+define void @test2(i32* %arg) #1 {
+entry:
+  %tmp = alloca i8
+  br label %loop
+
+loop:                                              ; preds = %loop, %entry
+  %iv = phi i32 [ %iv.next, %loop], [ 0, %entry ]
+  %idx.start = mul nuw nsw i32 %iv, 5
+  %tmp6 = getelementptr inbounds i32, i32* %arg, i32 %idx.start
+  %tmp7 = load i32, i32* %tmp6, align 4
+  %tmp8 = add nuw nsw i32 %idx.start, 1
+  %tmp9 = getelementptr inbounds i32, i32* %arg, i32 %tmp8
+  %tmp10 = load i32, i32* %tmp9, align 4
+  %tmp11 = add nuw nsw i32 %idx.start, 2
+  %tmp12 = getelementptr inbounds i32, i32* %arg, i32 %tmp11
+  %tmp13 = load i32, i32* %tmp12, align 4
+  %tmp14 = add nuw nsw i32 %idx.start, 3
+  %tmp15 = getelementptr inbounds i32, i32* %arg, i32 %tmp14
+  %tmp16 = load i32, i32* %tmp15, align 4
+  store i8 0, i8* %tmp, align 1
+  %iv.next= add nuw nsw i32 %iv, 1
+  %exit.cond = icmp eq i32 %iv.next, 128
+  br i1 %exit.cond, label %exit, label %loop
+
+exit:                                             ; preds = %loop
+  ret void
+}
+
+
+attributes #0 = { "target-features"="+hvx,+hvx-length128b" }
+attributes #1 = { optsize "target-features"="+hvx,+hvx-length128b" }