[llvm] b4c6d1b - [LoopVectorizer] Don't perform interleaving of predicated scalar loops

David Green via llvm-commits <llvm-commits at lists.llvm.org>
Mon Feb 7 11:34:33 PST 2022


Author: David Green
Date: 2022-02-07T19:34:28Z
New Revision: b4c6d1bb379192cb5b712fda9f60cd105f21194f

URL: https://github.com/llvm/llvm-project/commit/b4c6d1bb379192cb5b712fda9f60cd105f21194f
DIFF: https://github.com/llvm/llvm-project/commit/b4c6d1bb379192cb5b712fda9f60cd105f21194f.diff

LOG: [LoopVectorizer] Don't perform interleaving of predicated scalar loops

The vectorizer will at times choose to "vectorize" loops with a scalar
factor (VF=1) but with interleaving (IC > 1). This can occasionally produce
better code than the unroller (notably for reductions, where it can
produce independent reduction chains that are combined after the loop).
At times this is not very beneficial though, for example when runtime
checks are needed or when the scalar code requires predication.
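For illustration, here is a minimal C++ sketch (hypothetical, not taken
from the patch) of what VF=1, IC=2 interleaving does to a simple sum
reduction: the single dependence chain is split into two independent
chains that are combined after the loop.

  // Hypothetical reduction loop, interleaved at VF=1, IC=2.
  float sum_interleaved(const float *A, int N) {
    float S0 = 0.0f, S1 = 0.0f;   // two independent reduction chains
    int I = 0;
    for (; I + 1 < N; I += 2) {   // interleaved scalar body
      S0 += A[I];
      S1 += A[I + 1];
    }
    for (; I < N; ++I)            // scalar remainder iterations
      S0 += A[I];
    return S0 + S1;               // chains combined after the loop
  }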

This patch addresses the second point, preventing the vectorizer from
interleaving when the scalar loop will require predication. That stops
it from making a bit of a mess that is worse than the original loop and
better left for the unroller to unroll, if that is beneficial. It helps
reverse some of the regressions from D118090.
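As a rough example of the kind of loop this now leaves alone (a
hypothetical sketch in the spirit of the test below, not taken from it):
the guarded body means every interleaved scalar lane would need its own
pred.load.if / pred.load.continue blocks, as the removed CHECK lines in
the test diff show.

  // Hypothetical predicated loop: the body only runs when the guard
  // holds, so VF=1 interleaving must predicate each lane separately.
  float dot_guarded(const float *A, const float *B, int N, int M) {
    float Sum = 0.0f;
    for (int J = 0; J < N; ++J)
      if (J < M)                  // guard -> blockNeedsPredication()
        Sum += A[J] * B[J];
    return Sum;
  }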

Differential Revision: https://reviews.llvm.org/D118566

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 295f2a90a1c48..f8f54a0e70603 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6136,9 +6136,15 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
     return IC;
   }
 
-  // Note that if we've already vectorized the loop we will have done the
-  // runtime check and so interleaving won't require further checks.
-  bool InterleavingRequiresRuntimePointerCheck =
+  // For any scalar loop that either requires runtime checks or predication we
+  // are better off leaving this to the unroller. Note that if we've already
+  // vectorized the loop we will have done the runtime check and so interleaving
+  // won't require further checks.
+  bool ScalarInterleavingRequiresPredication =
+      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
+         return Legal->blockNeedsPredication(BB);
+       }));
+  bool ScalarInterleavingRequiresRuntimePointerCheck =
       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
 
   // We want to interleave small loops in order to reduce the loop overhead and
@@ -6148,7 +6154,8 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                     << "LV: VF is " << VF << '\n');
   const bool AggressivelyInterleaveReductions =
       TTI.enableAggressiveInterleaving(HasReductions);
-  if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
+      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
     // We assume that the cost overhead is 1 and we use the cost model
     // to estimate the cost of the loop and interleave until the cost of the
     // loop overhead is about 5% of the cost of the loop.

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll
index cae779aaeb7c6..510ff4d8ff56a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-vectorize -S -o - < %s | FileCheck %s
+; RUN: opt -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S -o - < %s | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-arm-none-eabi"
@@ -47,90 +48,10 @@ define void @arm_correlate_f16(half* nocapture noundef readonly %pSrcA, i32 noun
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 1, [[IF_END12]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ]
 ; CHECK-NEXT:    [[I_077:%.*]] = phi i32 [ 0, [[IF_END12]] ], [ [[INC33:%.*]], [[FOR_END]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_176:%.*]] = phi half* [ [[PDST_ADDR_0]], [[IF_END12]] ], [ [[PDST_ADDR_2:%.*]], [[FOR_END]] ]
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[INDVARS_IV]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[I_077]])
-; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[I_077]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], [[I_077]]
-; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[INDVARS_IV]], 2
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[INDVARS_IV]], [[N_MOD_VF]]
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE9:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi half [ 0xH0000, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE9]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi half [ 0xH0000, [[VECTOR_PH]] ], [ [[PREDPHI10:%.*]], [[PRED_LOAD_CONTINUE9]] ]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT:    [[INDUCTION2:%.*]] = add i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 [[I_077]], [[INDUCTION]]
-; CHECK-NEXT:    [[TMP6:%.*]] = sub i32 [[I_077]], [[INDUCTION2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ult i32 [[TMP5]], [[SRCBLEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP6]], [[SRCBLEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[INDUCTION]], [[SRCALEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp ult i32 [[INDUCTION2]], [[SRCALEN_ADDR_0]]
-; CHECK-NEXT:    [[TMP11:%.*]] = and i1 [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK:       pred.load.if:
-; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[INDUCTION]] to i64
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds half, half* [[PIN1_0]], i64 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = load half, half* [[TMP14]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE]]
-; CHECK:       pred.load.continue:
-; CHECK-NEXT:    [[TMP16:%.*]] = phi half [ poison, [[VECTOR_BODY]] ], [ [[TMP15]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
-; CHECK:       pred.load.if4:
-; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[INDUCTION2]] to i64
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds half, half* [[PIN1_0]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = load half, half* [[TMP18]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE5]]
-; CHECK:       pred.load.continue5:
-; CHECK-NEXT:    [[TMP20:%.*]] = phi half [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]]
-; CHECK:       pred.load.if6:
-; CHECK-NEXT:    [[TMP21:%.*]] = sub nsw i32 0, [[TMP5]]
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds half, half* [[PIN2_0]], i64 [[TMP22]]
-; CHECK-NEXT:    [[TMP24:%.*]] = load half, half* [[TMP23]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE7]]
-; CHECK:       pred.load.continue7:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi half [ poison, [[PRED_LOAD_CONTINUE5]] ], [ [[TMP24]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9]]
-; CHECK:       pred.load.if8:
-; CHECK-NEXT:    [[TMP26:%.*]] = sub nsw i32 0, [[TMP6]]
-; CHECK-NEXT:    [[TMP27:%.*]] = sext i32 [[TMP26]] to i64
-; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds half, half* [[PIN2_0]], i64 [[TMP27]]
-; CHECK-NEXT:    [[TMP29:%.*]] = load half, half* [[TMP28]], align 2
-; CHECK-NEXT:    br label [[PRED_LOAD_CONTINUE9]]
-; CHECK:       pred.load.continue9:
-; CHECK-NEXT:    [[TMP30:%.*]] = phi half [ poison, [[PRED_LOAD_CONTINUE7]] ], [ [[TMP29]], [[PRED_LOAD_IF8]] ]
-; CHECK-NEXT:    [[TMP31:%.*]] = fmul fast half [[TMP25]], [[TMP16]]
-; CHECK-NEXT:    [[TMP32:%.*]] = fmul fast half [[TMP30]], [[TMP20]]
-; CHECK-NEXT:    [[TMP33:%.*]] = fadd fast half [[TMP31]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP34:%.*]] = fadd fast half [[TMP32]], [[VEC_PHI3]]
-; CHECK-NEXT:    [[TMP35:%.*]] = xor i1 [[TMP11]], true
-; CHECK-NEXT:    [[TMP36:%.*]] = xor i1 [[TMP12]], true
-; CHECK-NEXT:    [[PREDPHI]] = select i1 [[TMP35]], half [[VEC_PHI]], half [[TMP33]]
-; CHECK-NEXT:    [[PREDPHI10]] = select i1 [[TMP36]], half [[VEC_PHI3]], half [[TMP34]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast half [[PREDPHI10]], [[PREDPHI]]
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[INDVARS_IV]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND14_PREHEADER]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, [[VECTOR_SCEVCHECK]] ], [ 0xH0000, [[FOR_COND14_PREHEADER]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY16:%.*]]
 ; CHECK:       for.body16:
-; CHECK-NEXT:    [[J_074:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT:    [[SUM_073:%.*]] = phi half [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_1:%.*]], [[FOR_INC]] ]
+; CHECK-NEXT:    [[J_074:%.*]] = phi i32 [ 0, [[FOR_COND14_PREHEADER]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT:    [[SUM_073:%.*]] = phi half [ 0xH0000, [[FOR_COND14_PREHEADER]] ], [ [[SUM_1:%.*]], [[FOR_INC]] ]
 ; CHECK-NEXT:    [[SUB17:%.*]] = sub i32 [[I_077]], [[J_074]]
 ; CHECK-NEXT:    [[CMP18:%.*]] = icmp ult i32 [[SUB17]], [[SRCBLEN_ADDR_0]]
 ; CHECK-NEXT:    [[CMP19:%.*]] = icmp ult i32 [[J_074]], [[SRCALEN_ADDR_0]]
@@ -139,21 +60,21 @@ define void @arm_correlate_f16(half* nocapture noundef readonly %pSrcA, i32 noun
 ; CHECK:       if.then20:
 ; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[J_074]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds half, half* [[PIN1_0]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP38:%.*]] = load half, half* [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load half, half* [[ARRAYIDX]], align 2
 ; CHECK-NEXT:    [[SUB22:%.*]] = sub nsw i32 0, [[SUB17]]
 ; CHECK-NEXT:    [[IDXPROM23:%.*]] = sext i32 [[SUB22]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX24:%.*]] = getelementptr inbounds half, half* [[PIN2_0]], i64 [[IDXPROM23]]
-; CHECK-NEXT:    [[TMP39:%.*]] = load half, half* [[ARRAYIDX24]], align 2
-; CHECK-NEXT:    [[MUL:%.*]] = fmul fast half [[TMP39]], [[TMP38]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[ARRAYIDX24]], align 2
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast half [[TMP3]], [[TMP2]]
 ; CHECK-NEXT:    [[ADD25:%.*]] = fadd fast half [[MUL]], [[SUM_073]]
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
 ; CHECK-NEXT:    [[SUM_1]] = phi half [ [[ADD25]], [[IF_THEN20]] ], [ [[SUM_073]], [[FOR_BODY16]] ]
 ; CHECK-NEXT:    [[INC]] = add nuw i32 [[J_074]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY16]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY16]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi half [ [[SUM_1]], [[FOR_INC]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[SUM_1_LCSSA:%.*]] = phi half [ [[SUM_1]], [[FOR_INC]] ]
 ; CHECK-NEXT:    [[PDST_ADDR_2]] = getelementptr inbounds half, half* [[PDST_ADDR_176]], i64 [[CMP27]]
 ; CHECK-NEXT:    store half [[SUM_1_LCSSA]], half* [[PDST_ADDR_176]], align 2
 ; CHECK-NEXT:    [[INC33]] = add nuw i32 [[I_077]], 1

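As a usage sketch (assuming an opt binary built from this revision), the
new RUN line can be reproduced directly; with this change the inner loop
stays scalar instead of being interleaved and predicated:

  opt -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
      -S -o - llvm/test/Transforms/LoopVectorize/AArch64/scalar_interleave.ll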
