[llvm] 8f18874 - [LV] Still vectorise when tail-folding can't find a primary induction variable

Thu Jan 9 01:16:57 PST 2020

Author: Sjoerd Meijer
Date: 2020-01-09T09:14:00Z
New Revision: 8f1887456ab4ba24a62ccb19d0d04b08972a0289

URL: https://github.com/llvm/llvm-project/commit/8f1887456ab4ba24a62ccb19d0d04b08972a0289
DIFF: https://github.com/llvm/llvm-project/commit/8f1887456ab4ba24a62ccb19d0d04b08972a0289.diff

LOG: [LV] Still vectorise when tail-folding can't find a primary induction variable

This addresses a vectorisation regression for tail-folded loops that are
counting down, e.g. loops as simple as this:

  void foo(char *A, char *B, char *C, uint32_t N) {
    while (N > 0) {
      *C++ = *A++ + *B++;
       N--;
    }
  }

These loops can be vectorised, but when tail-folding is requested, the
vectoriser can't find a primary induction variable, which is needed for
predicating the loop. As a result, the loop isn't vectorised at all, even
though it is vectorised when tail-folding is not attempted. So, this adds a
check for the primary induction variable where we decide how to lower the
scalar epilogue. I.e., when there isn't a primary induction variable, a scalar
epilogue loop is allowed (i.e. tail-folding isn't requested) so that
vectorisation can still be triggered.

Having this check for the primary induction variable makes sense anyway, and in
addition, in a follow-up to this I will look into discovering the primary
induction variable earlier for counting-down loops, so that these can also be
tail-folded.

Differential revision: https://reviews.llvm.org/D72324

Added: 
    llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
    llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fd30d52a562a..0400e44dd0ec 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7502,30 +7502,43 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues);
 }
 
-static ScalarEpilogueLowering
-getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
-                          ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
-                          TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
-                          AssumptionCache *AC, LoopInfo *LI,
-                          ScalarEvolution *SE, DominatorTree *DT,
-                          const LoopAccessInfo *LAI) {
-  ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
+// Determine how to lower the scalar epilogue, which depends on 1) optimising
+// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
+// predication, and 4) a TTI hook that analyses whether the loop is suitable
+// for predication.
+static ScalarEpilogueLowering getScalarEpilogueLowering(
+    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
+    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+    LoopVectorizationLegality &LVL) {
+  bool OptSize =
+      F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
+                                                     PGSOQueryType::IRPass);
+  // 1) OptSize takes precedence over all other options, i.e. if this is set,
+  // don't look at hints or options, and don't request a scalar epilogue.
+  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+    return CM_ScalarEpilogueNotAllowedOptSize;
+
   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
                               !PreferPredicateOverEpilog;
 
-  if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
-      (F->hasOptSize() ||
-       llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
-                                   PGSOQueryType::IRPass)))
-    SEL = CM_ScalarEpilogueNotAllowedOptSize;
-  else if (PreferPredicateOverEpilog ||
-           Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
-           (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) &&
-            Hints.getPredicate() != LoopVectorizeHints::FK_Disabled &&
-            !PredicateOptDisabled))
-    SEL = CM_ScalarEpilogueNotNeededUsePredicate;
+  // 2) Next, if disabling predication is requested on the command line, honour
+  // this and request a scalar epilogue. Also do this if we don't have a
+  // primary induction variable, which is required for predication.
+  if (PredicateOptDisabled || !LVL.getPrimaryInduction())
+    return CM_ScalarEpilogueAllowed;
+
+  // 3) and 4) look if enabling predication is requested on the command line,
+  // with a loop hint, or if the TTI hook indicates this is profitable, request
+  // predication .
+  if (PreferPredicateOverEpilog ||
+      Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
+      (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
+                                        LVL.getLAI()) &&
+       Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
+    return CM_ScalarEpilogueNotNeededUsePredicate;
 
-  return SEL;
+  return CM_ScalarEpilogueAllowed;
 }
 
 // Process the loop in the VPlan-native vectorization path. This path builds
@@ -7543,9 +7556,8 @@ static bool processLoopInVPlanNativePath(
   Function *F = L->getHeader()->getParent();
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
 
-  ScalarEpilogueLowering SEL =
-    getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
-                              PSE.getSE(), DT, LVL->getLAI());
+  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                 &Hints, IAI);
@@ -7637,9 +7649,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
   // Check the function attributes and profiles to find out if this function
   // should be optimized for size.
-  ScalarEpilogueLowering SEL =
-    getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI,
-                              PSE.getSE(), DT, LVL.getLAI());
+  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
 
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
new file mode 100644
index 000000000000..5a3438230a2d
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s
+
+; Check that when we can't predicate this loop that it is still vectorised (with
+; an epilogue).
+; TODO: the reason this can't be predicated is because a primary induction
+; variable can't be found (not yet) for this counting down loop. But with that
+; fixed, this should be able to be predicated.
+
+; CHECK-LABEL: vector.body:
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) #0 {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+  %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
+  %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
+  %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
+  %0 = load i8, i8* %A.addr.07, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %1 = load i8, i8* %B.addr.08, align 1
+  %add = add i8 %1, %0
+  %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
+  store i8 %add, i8* %C.addr.09, align 1
+  %dec = add i32 %N.addr.010, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  ret void
+}
+
+attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

diff  --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
new file mode 100644
index 000000000000..2667bfe68f61
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s
+
+; Check that when we can't predicate this loop that it is still vectorised (with
+; an epilogue).
+; TODO: the reason this can't be predicated is because a primary induction
+; variable can't be found (not yet) for this counting down loop. But with that
+; fixed, this should be able to be predicated.
+
+; CHECK-LABEL: vector.body:
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) {
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+  %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
+  %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
+  %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1
+  %0 = load i8, i8* %A.addr.07, align 1
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1
+  %1 = load i8, i8* %B.addr.08, align 1
+  %add = add i8 %1, %0
+  %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1
+  store i8 %add, i8* %C.addr.09, align 1
+  %dec = add i32 %N.addr.010, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  ret void
+}