[llvm] b0f904b - [LV] Account for minimum vscale when rejecting scalable vectorization of short loops

Fri Dec 9 11:29:49 PST 2022

Author: Philip Reames
Date: 2022-12-09T11:29:41-08:00
New Revision: b0f904b6da044ea65d344bc07fc56234adbc6268

URL: https://github.com/llvm/llvm-project/commit/b0f904b6da044ea65d344bc07fc56234adbc6268
DIFF: https://github.com/llvm/llvm-project/commit/b0f904b6da044ea65d344bc07fc56234adbc6268.diff

LOG: [LV] Account for minimum vscale when rejecting scalable vectorization of short loops

The vectorizer has code to reject scalable vectorization of loops with very short trip counts, and instead use fixed length vectors. The current code doesn't account for the minimum vscale value known, and thus under estimates the number of lanes in the scalable type for RISCV's default configuration. This results in use of predication and a trivially dead loop where a single straight line piece of code would suffice.

Note that the code quality of the original scalable vectorization could (and probably should) be improved other ways as well. This patch is solely about whether the scalable vectorization was the right choice to begin with.

This bit of code - both with and without my change - does make the unchecked assumption that the target knows how to lower fixed length vectors whose length is provably less than the vector length.

Differential Revision: https://reviews.llvm.org/D137285

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ba0185f0ab0..9805a845d99a 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5182,7 +5182,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
     ElementCount MaxSafeVF, bool FoldTailByMasking) {
   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
-  TypeSize WidestRegister = TTI.getRegisterBitWidth(
+  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector);
 
@@ -5209,9 +5209,14 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     return ElementCount::getFixed(1);
   }
 
-  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
-  if (ConstTripCount &&
-      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
+  unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
+  if (MaxVectorElementCount.isScalable() &&
+      TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
+    auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
+    auto Min = Attr.getVScaleRangeMin();
+    WidestRegisterMinEC *= Min;
+  }
+  if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
     // If loop trip count (TC) is known at compile time there is no point in
     // choosing VF greater than TC (as done in the loop below). Select maximum

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
index 98abf020a3b4..24b692c12172 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/short-trip-count.ll
@@ -1,10 +1,63 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mtriple=riscv64 -mattr=+v -passes=loop-vectorize < %s | FileCheck %s
 
-; FIXME: Using a <4 x i32> would be strictly better than tail folded
-; scalable vectorization in this case.
-define void @small_trip_count(i32* nocapture %a) nounwind vscale_range(4,1024) {
-; CHECK-LABEL: @small_trip_count(
+define void @small_trip_count_min_vlen_128(i32* nocapture %a) nounwind vscale_range(4,1024) {
+; CHECK-LABEL: @small_trip_count_min_vlen_128(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 4, 4
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[IV]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[V]], 1
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep = getelementptr inbounds i32, i32* %a, i32 %iv
+  %v = load i32, i32* %gep, align 4
+  %add = add nsw i32 %v, 1
+  store i32 %add, i32* %gep, align 4
+  %iv.next = add i32 %iv, 1
+  %cond = icmp eq i32 %iv, 3
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Note: This test uses a vscale_range starting at 1, which is technically incompatibile
+; with +v.  If we expose a target hook for minimum vlen, this example will need reworked
+define void @small_trip_count_min_vlen_32(i32* nocapture %a) nounwind vscale_range(1,1024) {
+; CHECK-LABEL: @small_trip_count_min_vlen_32(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 2
@@ -35,7 +88,7 @@ define void @small_trip_count(i32* nocapture %a) nounwind vscale_range(4,1024) {
 ; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -49,7 +102,7 @@ define void @small_trip_count(i32* nocapture %a) nounwind vscale_range(4,1024) {
 ; CHECK-NEXT:    store i32 [[ADD]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[IV]], 3
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;