[llvm] cab6cd6 - [AArch64][LoopVectorize] Introduce trip count minimal value threshold to ignore tail-folding.

Tue Aug 9 14:11:12 PDT 2022

Author: Dinar Temirbulatov
Date: 2022-08-09T22:10:17+01:00
New Revision: cab6cd68340255be241b7cf169c67a1899ced115

URL: https://github.com/llvm/llvm-project/commit/cab6cd68340255be241b7cf169c67a1899ced115
DIFF: https://github.com/llvm/llvm-project/commit/cab6cd68340255be241b7cf169c67a1899ced115.diff

LOG: [AArch64][LoopVectorize] Introduce trip count minimal value threshold to ignore tail-folding.

After D121595 was commited, I noticed regressions assosicated with small trip
count numbersvectorisation by tail folding with scalable vectors. As a solution
for those issues I propose to introduce the minimal trip count threshold value.

  Differential Revision: https://reviews.llvm.org/D130755

Added: 
    

Modified: 
    llvm/include/llvm/Analysis/TargetTransformInfo.h
    llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/lib/Analysis/TargetTransformInfo.cpp
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 43c2fce688452..c838df90aeb98 100644

--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1465,6 +1465,10 @@ class TargetTransformInfo {
   /// to a stack reload.
   unsigned getGISelRematGlobalCost() const;
 
+  /// \returns the lower bound of a trip count to decide on vectorization
+  /// while tail-folding.
+  unsigned getMinTripCountTailFoldingThreshold() const;
+
   /// \returns True if the target supports scalable vectors.
   bool supportsScalableVectors() const;
 
@@ -1863,6 +1867,7 @@ class TargetTransformInfo::Concept {
                                                ReductionFlags) const = 0;
   virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
   virtual unsigned getGISelRematGlobalCost() const = 0;
+  virtual unsigned getMinTripCountTailFoldingThreshold() const = 0;
   virtual bool enableScalableVectorization() const = 0;
   virtual bool supportsScalableVectors() const = 0;
   virtual bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
@@ -2502,6 +2507,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
     return Impl.getGISelRematGlobalCost();
   }
 
+  unsigned getMinTripCountTailFoldingThreshold() const override {
+    return Impl.getMinTripCountTailFoldingThreshold();
+  }
+
   bool supportsScalableVectors() const override {
     return Impl.supportsScalableVectors();
   }

diff  --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index cf0995a9b0fe0..abc9116a5a2a4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -817,6 +817,8 @@ class TargetTransformInfoImplBase {
 
   unsigned getGISelRematGlobalCost() const { return 1; }
 
+  unsigned getMinTripCountTailFoldingThreshold() const { return 0; }
+
   bool supportsScalableVectors() const { return false; }
 
   bool enableScalableVectorization() const { return false; }

diff  --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 74b1083397bef..eba58a0dcce99 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1132,6 +1132,10 @@ unsigned TargetTransformInfo::getGISelRematGlobalCost() const {
   return TTIImpl->getGISelRematGlobalCost();
 }
 
+unsigned TargetTransformInfo::getMinTripCountTailFoldingThreshold() const {
+  return TTIImpl->getMinTripCountTailFoldingThreshold();
+}
+
 bool TargetTransformInfo::supportsScalableVectors() const {
   return TTIImpl->supportsScalableVectors();
 }

diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c4e7135d8b57a..acd243d3f1736 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -335,6 +335,10 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
     return 2;
   }
 
+  unsigned getMinTripCountTailFoldingThreshold() const {
+    return ST->hasSVE() ? 5 : 0;
+  }
+
   PredicationStyle emitGetActiveLaneMask() const {
     if (ST->hasSVE())
       return PredicationStyle::DataAndControlFlow;

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 368ab0078c0b9..c1d5677d16f78 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -10145,8 +10145,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
     else {
-      LLVM_DEBUG(dbgs() << "\n");
-      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+      if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
+        LLVM_DEBUG(dbgs() << "\n");
+        SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+      } else {
+        LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
+                             "small to consider vectorizing.\n");
+        reportVectorizationFailure(
+            "The trip count is below the minial threshold value.",
+            "loop trip count is too low, avoiding vectorization",
+            "LowTripCount", ORE, L);
+        Hints.emitRemarkWithHints();
+        return false;
+      }
     }
   }
 

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
index a97782aac2845..bc3e1c2513a11 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -40,18 +40,22 @@ for.end:                                          ; preds = %for.body
 
 define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
 ; CHECK-LABEL: @trip5_i8(
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
-; CHECK:         [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ {{%.*}}, %vector.ph ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %vector.body ]
-; CHECK:         {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK:         {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
-; CHECK:         call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> {{%.*}}, <vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK:         [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[VF:%.*]] = mul i64 [[VSCALE]], 16
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 5)
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NOT:%.*]] = xor <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i32 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer)
-; CHECK-NEXT:    br i1 true, label %middle.block, label %vector.body
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[MUL:%.*]] = shl i8 [[TMP0]], 1
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[I_08]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[MUL]], [[TMP1]]
+; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX1]], align 1
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[I_08]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 5
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
 ;
 entry:
   br label %for.body