[llvm] a0fcf84 - [LV] Consider if scalar epilogue is required in getMaximizedVFForTarget.

Thu Jul 6 05:36:02 PDT 2023

Author: Florian Hahn
Date: 2023-07-06T13:35:35+01:00
New Revision: a0fcf84a8c4d953730170f8f2da17a7556991607

URL: https://github.com/llvm/llvm-project/commit/a0fcf84a8c4d953730170f8f2da17a7556991607
DIFF: https://github.com/llvm/llvm-project/commit/a0fcf84a8c4d953730170f8f2da17a7556991607.diff

LOG: [LV] Consider if scalar epilogue is required in getMaximizedVFForTarget.

When a scalar epilogue is required, at least one iteration of the scalar loop
has to execute. Adjust ConstTripCount accordingly to avoid picking a max VF
that results in a dead vector loop.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D154261

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
    llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index d82c664029b234..548cde377532b9 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5288,6 +5288,13 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
     auto Min = Attr.getVScaleRangeMin();
     WidestRegisterMinEC *= Min;
   }
+
+  // When a scalar epilogue is required, at least one iteration of the scalar
+  // loop has to execute. Adjust ConstTripCount accordingly to avoid picking a
+  // max VF that results in a dead vector loop.
+  if (ConstTripCount > 0 && requiresScalarEpilogue(true))
+    ConstTripCount -= 1;
+
   if (ConstTripCount && ConstTripCount <= WidestRegisterMinEC &&
       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
     // If loop trip count (TC) is known at compile time there is no point in

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
index b8333ad0c47789..2a8476c8d7111b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll
@@ -283,12 +283,10 @@ exit:
   ret void
 }
 
-define void @limit_main_loop_vf_to_avoid_epilogue_vectorization(ptr noalias %src, ptr noalias %dst) {
-; CHECK-LABEL: @limit_main_loop_vf_to_avoid_epilogue_vectorization(
-; CHECK-NEXT:  iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK:       vector.main.loop.iter.check:
-; CHECK-NEXT:    br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+define void @limit_main_loop_vf_to_avoid_dead_main_vector_loop(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: @limit_main_loop_vf_to_avoid_dead_main_vector_loop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
@@ -296,48 +294,28 @@ define void @limit_main_loop_vf_to_avoid_epilogue_vectorization(ptr noalias %src
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC:%.*]], i64 [[TMP0]], i64 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <96 x i8>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <96 x i8> [[WIDE_VEC]], <96 x i8> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <24 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[DST:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
-; CHECK-NEXT:    store <32 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    store <8 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
 ; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    br label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK:       vec.epilog.iter.check:
-; CHECK-NEXT:    br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK:       vec.epilog.ph:
-; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT:    br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK:       vec.epilog.vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP6]], i64 0
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC2]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 8
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 24
-; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK:       vec.epilog.middle.block:
-; CHECK-NEXT:    br label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK:       vec.epilog.scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT:    br label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 24, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[IV]], i64 0
 ; CHECK-NEXT:    [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1
 ; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 32
-; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
index a23e6326d8631d..a9f4b7d7f209cb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr56319-vector-exit-cond-optimization-epilogue-vectorization.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes=loop-vectorize -mcpu=skylake-avx512 -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -mcpu=skylake-avx512 -epilogue-vectorization-force-VF=2 -S %s | FileCheck %s
 
 target triple = "x86_64-apple-macos"
 
 ; Test case where the exit condition in the main vector loop can be optimized
-; to true, but not in the epilogue vector loop. In the test the interleave
-; group requires to execute at least one scalar iteration, meaning the last
-; vector iteration of the epilogue vector loop cannot be executed.
+; to true, but not in the epilogue vector loop. A single iteration of the main
+; vector loop will be executed, followed by 2 iterations of the vector epilogue,
+; followed by a final iteration of the scalar loop,
 define void @pr56319(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-LABEL: @pr56319(
 ; CHECK-NEXT:  iter.check:
@@ -40,12 +40,12 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX1]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP6]], i64 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <12 x i8>, ptr [[TMP8]], align 1
-; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i8> [[WIDE_VEC2]], <12 x i8> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[WIDE_VEC2:%.*]] = load <6 x i8>, ptr [[TMP8]], align 1
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <6 x i8> [[WIDE_VEC2]], <6 x i8> poison, <2 x i32> <i32 0, i32 3>
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    store <4 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT:    store <2 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 2
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 36
 ; CHECK-NEXT:    br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       vec.epilog.middle.block:
@@ -60,7 +60,7 @@ define void @pr56319(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
 ; CHECK-NEXT:    store i8 [[L]], ptr [[GEP_DST]], align 1
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 39
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 37
 ; CHECK-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -75,7 +75,7 @@ loop:
   %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
   store i8 %l, ptr %gep.dst, align 1
   %iv.next = add nuw nsw i64 %iv, 1
-  %cmp = icmp eq i64 %iv.next, 39
+  %cmp = icmp eq i64 %iv.next, 37
   br i1 %cmp, label %exit, label %loop
 
 exit: