[llvm] 959eaa5 - [ARM][MVE] Only tail-fold integer add reductions

Tue Jul 14 02:15:39 PDT 2020

Author: Sjoerd Meijer
Date: 2020-07-14T10:15:07+01:00
New Revision: 959eaa50d62d807dd78c980d5f3b9da0f06b0003

URL: https://github.com/llvm/llvm-project/commit/959eaa50d62d807dd78c980d5f3b9da0f06b0003
DIFF: https://github.com/llvm/llvm-project/commit/959eaa50d62d807dd78c980d5f3b9da0f06b0003.diff

LOG: [ARM][MVE] Only tail-fold integer add reductions

If a vector body has live-out values, it is probably a reduction, which needs a
final reduction step after the loop. MVE has a VADDV instruction to reduce
integer vectors, but doesn't have an equivalent one for float vectors. A
live-out value that is not recognised as a reduction later in the optimisation
pipeline will cause the tail-predicated loop to be reverted to a
non-predicated loop, which is very expensive, i.e. it has a significant
performance impact. This is what we hope to avoid by fine-tuning the ARM TTI
hook preferPredicateOverEpilogue implementation.

Differential Revision: https://reviews.llvm.org/D82953

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
    llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 575e6171059d..bea4e157a131 100644

--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -1405,12 +1406,47 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
 static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                  const DataLayout &DL,
                                  const LoopAccessInfo *LAI) {
+  LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
+
+  // If there are live-out values, it is probably a reduction, which needs a
+  // final reduction step after the loop. MVE has a VADDV instruction to reduce
+  // integer vectors, but doesn't have an equivalent one for float vectors. A
+  // live-out value that is not recognised as a reduction will cause the
+  // tail-predicated loop to be reverted to a non-predicated loop, which is
+  // very expensive, i.e. it has a significant performance impact. So, in this
+  // case it's better not to tail-predicate the loop, which is what we check
+  // here. Thus, we allow only 1 live-out value, which has to be an integer
+  // reduction, which matches the loops supported by ARMLowOverheadLoops.
+  // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in
+  // sync with each other.
+  SmallVector< Instruction *, 8 > LiveOuts;
+  LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
+  bool IntReductionsDisabled =
+      EnableTailPredication == TailPredication::EnabledNoReductions ||
+      EnableTailPredication == TailPredication::ForceEnabledNoReductions;
+
+  for (auto *I : LiveOuts) {
+    if (!I->getType()->isIntegerTy()) {
+      LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer "
+                           "live-out value\n");
+      return false;
+    }
+    if (I->getOpcode() != Instruction::Add) {
+      LLVM_DEBUG(dbgs() << "Only add reductions supported\n");
+      return false;
+    }
+    if (IntReductionsDisabled) {
+      LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n");
+      return false;
+    }
+  }
+
+  // Next, check that all instructions can be tail-predicated.
   PredicatedScalarEvolution PSE = LAI->getPSE();
+  SmallVector<Instruction *, 16> LoadStores;
   int ICmpCount = 0;
   int Stride = 0;
 
-  LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
-  SmallVector<Instruction *, 16> LoadStores;
   for (BasicBlock *BB : L->blocks()) {
     for (Instruction &I : BB->instructionsWithoutDebug()) {
       if (isa<PHINode>(&I))
@@ -1458,8 +1494,10 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
                                              TargetLibraryInfo *TLI,
                                              DominatorTree *DT,
                                              const LoopAccessInfo *LAI) {
-  if (!EnableTailPredication)
+  if (!EnableTailPredication) {
+    LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
     return false;
+  }
 
   // Creating a predicated vector loop is the first step for generating a
   // tail-predicated hardware loop, for which we need the MVE masked

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
index eda3c115c0f6..c6b415ff9cd4 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -4,6 +4,13 @@
 ; RUN: opt < %s -loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilog -S | \
 ; RUN:   FileCheck -check-prefixes=COMMON,PREDFLAG %s
 
+; RUN: opt < %s -loop-vectorize -tail-predication=enabled-no-reductions -S | \
+; RUN:  FileCheck %s -check-prefixes=COMMON,NORED
+
+; RUN: opt < %s -loop-vectorize -tail-predication=force-enabled-no-reductions -S | \
+; RUN:  FileCheck %s -check-prefixes=COMMON,NORED
+
+
 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv8.1m.main-arm-unknown-eabihf"
 
@@ -162,6 +169,326 @@ for.body:                                         ; preds = %for.body.preheader,
   br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
 }
 
+define dso_local i32 @i32_add_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
+; COMMON-LABEL: i32_add_reduction(
+; COMMON:       entry:
+; CHECK:        @llvm.get.active.lane.mask
+; NORED-NOT:    @llvm.get.active.lane.mask
+; COMMON:       }
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %add.lcssa = phi i32 [ %add, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %S.0.lcssa = phi i32 [ 1, %entry ], [ %add.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %S.0.lcssa
+
+for.body:
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %S.07 = phi i32 [ %add, %for.body ], [ 1, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %S.07
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+
+; Don't tail-fold float reductions.
+;
+define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) local_unnamed_addr #0 {
+; CHECK-LABEL: f32_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp eq i32 %N, 0
+  br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader:                             ; preds = %entry
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.preheader, %while.body
+  %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+  %sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
+  %Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1
+  %0 = load float, float* %Input.addr.07, align 4
+  %add = fadd fast float %0, %sum.08
+  %dec = add i32 %blkCnt.09, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %add.lcssa = phi float [ %add, %while.body ]
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  %conv = uitofp i32 %N to float
+  %div = fdiv fast float %sum.0.lcssa, %conv
+  store float %div, float* %Output, align 4
+  ret void
+}
+
+; Don't tail-fold float reductions.
+;
+define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) local_unnamed_addr #0 {
+; CHECK-LABEL: mixed_f32_i32_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp15 = icmp eq i32 %N, 0
+  br i1 %cmp15, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+  %isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ]
+  %fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
+  %fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ]
+  %iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ]
+  %incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1
+  %incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1
+  %0 = load i32, i32* %iInput.addr.016, align 4
+  %add2 = add nsw i32 %0, %isum.019
+  %1 = load float, float* %fInput.addr.017, align 4
+  %add = fadd fast float %1, %fsum.018
+  %dec = add i32 %blkCnt.020, -1
+  %cmp = icmp eq i32 %dec, 0
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  %add.lcssa = phi float [ %add, %while.body ]
+  %add2.lcssa = phi i32 [ %add2, %while.body ]
+  %phitmp = sitofp i32 %add2.lcssa to float
+  br label %while.end
+
+while.end:
+  %fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
+  %isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ]
+  %conv = uitofp i32 %N to float
+  %div = fdiv fast float %fsum.0.lcssa, %conv
+  store float %div, float* %fOutput, align 4
+  %div5 = fdiv fast float %isum.0.lcssa, %conv
+  %conv6 = fptosi float %div5 to i32
+  store i32 %conv6, i32* %iOutput, align 4
+  ret void
+}
+
+define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: i32_mul_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  %mul.lcssa = phi i32 [ %mul, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %S.0.lcssa
+
+for.body:
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, %S.07
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
+; CHECK-LABEL: i32_or_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp sgt i32 %N, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %or.lcssa = phi i32 [ %or, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %S.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %or = or i32 %0, %S.07
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) local_unnamed_addr #0 {
+; CHECK-LABEL: i32_and_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp5 = icmp sgt i32 %N, 0
+  br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %and.lcssa = phi i32 [ %and, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %S.addr.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
+  %0 = load i32, i32* %arrayidx, align 4
+  %and = and i32 %0, %S.addr.06
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %N
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+; CHECK-LABEL: i32_smin_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ 2147483647, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %c = icmp slt i32 %r.07, %0
+  %add = select i1 %c, i32 %r.07, i32 %0
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %add, %for.body ]
+  ret i32 %r.0.lcssa
+}
+
+define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+; CHECK-LABEL: i32_smax_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ -2147483648, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %c = icmp sgt i32 %r.07, %0
+  %add = select i1 %c, i32 %r.07, i32 %0
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %add, %for.body ]
+  ret i32 %r.0.lcssa
+}
+
+define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+; CHECK-LABEL: i32_umin_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ 4294967295, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %c = icmp ult i32 %r.07, %0
+  %add = select i1 %c, i32 %r.07, i32 %0
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %r.0.lcssa = phi i32 [ 4294967295, %entry ], [ %add, %for.body ]
+  ret i32 %r.0.lcssa
+}
+
+define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 {
+; CHECK-LABEL: i32_umax_reduction(
+; CHECK:       vector.body:
+; CHECK-NOT:   @llvm.masked.load
+; CHECK-NOT:   @llvm.masked.store
+; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body, label %for.cond.cleanup
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %c = icmp ugt i32 %r.07, %0
+  %add = select i1 %c, i32 %r.07, i32 %0
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %r.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  ret i32 %r.0.lcssa
+}
+
 ; CHECK:      !0 = distinct !{!0, !1}
 ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}