[llvm] befc952 - [LoopVectorize] Permit tail-folding for low trip counts using scalable vectors

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Mon May 16 01:14:32 PDT 2022


Author: David Sherwood
Date: 2022-05-16T09:14:24+01:00
New Revision: befc952045066e840c7d84b3d3a4ab112a36366a

URL: https://github.com/llvm/llvm-project/commit/befc952045066e840c7d84b3d3a4ab112a36366a
DIFF: https://github.com/llvm/llvm-project/commit/befc952045066e840c7d84b3d3a4ab112a36366a.diff

LOG: [LoopVectorize] Permit tail-folding for low trip counts using scalable vectors

When the loop vectoriser encounters a known low trip count, it
tries to create a single predicated loop in order to get the
benefit of vectorisation and eliminate the scalar tail. However,
until now the vectoriser prevented the use of scalable vectors in
this case due to past concerns about stability. I believe that
tail-folded loops using scalable vectors are now sufficiently well
tested that we can enable this. For the same reason, I have also
enabled it when optimising for code size.
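
For illustration, the trip7_i64 test added below roughly corresponds
to a C++ source loop like the following (a hypothetical
reconstruction for readability, not taken from the commit). With a
trip count of 7 the loop can now be emitted as a single predicated
scalable-vector body instead of needing a scalar tail:

  #include <cstdint>

  // Hypothetical source loop behind the trip7_i64 test: the trip
  // count of 7 is known at compile time, so the whole loop fits in
  // one tail-folded (predicated) scalable-vector iteration set.
  void trip7_i64(int64_t *__restrict dst, const int64_t *__restrict src) {
    for (int64_t i = 0; i < 7; ++i)
      dst[i] += src[i] * 2;   // load, shift left by 1, add, store back
  }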

Tests added here:

  Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
  Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
  Transforms/LoopVectorize/RISCV/low-trip-count.ll

Differential Revision: https://reviews.llvm.org/D121595

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
    llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
    llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3501920430e11..266aee02d5c95 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5133,14 +5133,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
     }
   }
 
-  // For scalable vectors don't use tail folding for low trip counts or
-  // optimizing for code size. We only permit this if the user has explicitly
-  // requested it.
-  if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
-      ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
-      MaxFactors.ScalableVF.isVector())
-    MaxFactors.ScalableVF = ElementCount::getScalable(0);
-
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
new file mode 100644
index 0000000000000..e299d35a2dcd3
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-low-trip-count.ll
@@ -0,0 +1,73 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @trip7_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip7_i64(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK:         [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 7)
+; CHECK:         {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK:         {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK:         call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> {{%.*}}, <vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK:         [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VF:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT:    br i1 [[COND]], label %middle.block, label %vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %src, i64 %i.06
+  %0 = load i64, i64* %arrayidx, align 8
+  %mul = shl nsw i64 %0, 1
+  %arrayidx1 = getelementptr inbounds i64, i64* %dst, i64 %i.06
+  %1 = load i64, i64* %arrayidx1, align 8
+  %add = add nsw i64 %1, %mul
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, 7
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip5_i8(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK:         [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 {{%.*}}, i64 5)
+; CHECK:         {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK:         {{%.*}} = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0nxv16i8(<vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK:         call void @llvm.masked.store.nxv16i8.p0nxv16i8(<vscale x 16 x i8> {{%.*}}, <vscale x 16 x i8>* {{%.*}}, i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK:         [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VF:%.*]] = mul i64 [[VSCALE]], 16
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT:    br i1 [[COND]], label %middle.block, label %vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
+  %1 = load i8, i8* %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 5
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
new file mode 100644
index 0000000000000..c93fca02dbf23
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-optsize.ll
@@ -0,0 +1,39 @@
+; RUN: opt -loop-vectorize -S < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @trip1024_i64(i64* noalias nocapture noundef %dst, i64* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip1024_i64(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK:         [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 {{%.*}}, i64 1024)
+; CHECK:         {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK:         {{%.*}} = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64.p0nxv2i64(<vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]], <vscale x 2 x i64> poison)
+; CHECK:         call void @llvm.masked.store.nxv2i64.p0nxv2i64(<vscale x 2 x i64> {{%.*}}, <vscale x 2 x i64>* {{%.*}}, i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK:         [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VF:%.*]] = mul i64 [[VSCALE]], 2
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i64 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT:    br i1 [[COND]], label %middle.block, label %vector.body
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %src, i64 %i.06
+  %0 = load i64, i64* %arrayidx, align 8
+  %mul = shl nsw i64 %0, 1
+  %arrayidx1 = getelementptr inbounds i64, i64* %dst, i64 %i.06
+  %1 = load i64, i64* %arrayidx1, align 8
+  %add = add nsw i64 %1, %mul
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.06, 1
+  %exitcond.not = icmp eq i64 %inc, 1024
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" optsize }

diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
new file mode 100644
index 0000000000000..e8341acc3657f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/low-trip-count.ll
@@ -0,0 +1,33 @@
+; RUN: opt -loop-vectorize -riscv-v-vector-bits-min=128 -scalable-vectorization=on -force-target-instruction-cost=1 -S < %s | FileCheck %s
+
+target triple = "riscv64"
+
+define void @trip5_i8(i8* noalias nocapture noundef %dst, i8* noalias nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: @trip5_i8(
+; CHECK:       vector.body:
+; CHECK:         [[ACTIVE_LANE_MASK:%.*]] = icmp ule <vscale x 8 x i64> {{%.*}}, shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 4, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK:         {{%.*}} = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* {{%.*}}, i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK:         {{%.*}} = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8.p0nxv8i8(<vscale x 8 x i8>* {{%.*}}, i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i8> poison)
+; CHECK:         call void @llvm.masked.store.nxv8i8.p0nxv8i8(<vscale x 8 x i8> {{%.*}}, <vscale x 8 x i8>* {{%.*}}, i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
+;
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %src, i64 %i.08
+  %0 = load i8, i8* %arrayidx, align 1
+  %mul = shl i8 %0, 1
+  %arrayidx1 = getelementptr inbounds i8, i8* %dst, i64 %i.08
+  %1 = load i8, i8* %arrayidx1, align 1
+  %add = add i8 %mul, %1
+  store i8 %add, i8* %arrayidx1, align 1
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, 5
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { "target-features"="+v,+d" }
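
The CHECK lines in the tests above look for @llvm.get.active.lane.mask
(or an equivalent icmp-based mask on RISC-V) together with masked
loads and stores. As a rough scalar sketch only (an illustrative
helper, not the generated code), each tail-folded vector iteration of
the trip7_i64 loop behaves per lane roughly like this:

  #include <cstdint>

  // Illustrative sketch of one predicated vector iteration: lanes whose
  // element index is >= the trip count are switched off by the active
  // lane mask, so the masked loads/stores never touch elements past the
  // end of the 7-element range.
  void predicated_body_sketch(int64_t *dst, const int64_t *src,
                              uint64_t index, uint64_t trip_count,
                              uint64_t vf /* e.g. vscale * 2 lanes */) {
    for (uint64_t lane = 0; lane < vf; ++lane) {
      bool active = index + lane < trip_count; // llvm.get.active.lane.mask
      if (active)                              // masked load, add, store
        dst[index + lane] += src[index + lane] * 2;
    }
  }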

More information about the llvm-commits mailing list