[llvm] d475030 - [SCEV] Apply loop guards to divisibility tests
Gil Rapaport via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 1 22:10:38 PST 2021
Author: Gil Rapaport
Date: 2021-02-02T08:09:39+02:00
New Revision: d475030dc28a85e649cdd3dd4e5941ec82227c26
URL: https://github.com/llvm/llvm-project/commit/d475030dc28a85e649cdd3dd4e5941ec82227c26
DIFF: https://github.com/llvm/llvm-project/commit/d475030dc28a85e649cdd3dd4e5941ec82227c26.diff
LOG: [SCEV] Apply loop guards to divisibility tests
Extend applyLoopGuards() to take into account conditions/assumes that prove
some value %v is divisible by D, by rewriting %v as (%v / D) * D. This lets
the loop unroller and the loop vectorizer identify more loops as not
requiring remainder loops.
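For illustration (a sketch based on the trip-multiple-guard-info.ll test
below; the block names are made up), a loop guarded by

  %u = urem i32 %num, 4
  %c = icmp eq i32 %u, 0
  br i1 %c, label %loop.preheader, label %exit

has its trip-count expression %num rewritten to ((%num /u 4) * 4), so
GetMinTrailingZeros() sees at least two known-zero low bits and
getSmallConstantTripMultiple() can report a trip multiple of 4 rather than
1. An llvm.assume of the same condition works the same way, as the new
LoopUnroll test exercises.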
Differential Revision: https://reviews.llvm.org/D95521
Added:
llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
Modified:
llvm/include/llvm/Analysis/ScalarEvolution.h
llvm/lib/Analysis/ScalarEvolution.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index b3f199de2cfa..c35c1db7dfe0 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1177,6 +1177,9 @@ class ScalarEvolution {
/// sharpen it.
void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags);
+ /// Try to apply information from loop guards for \p L to \p Expr.
+ const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
+
private:
/// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
/// Value is deleted.
@@ -2021,9 +2024,6 @@ class ScalarEvolution {
/// Assign A and B to LHS and RHS, respectively.
bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
- /// Try to apply information from loop guards for \p L to \p Expr.
- const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
-
/// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
/// `UniqueSCEVs`.
///
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index ce97fa0edbe3..484d3387acba 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6887,7 +6887,8 @@ ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
// Attempt to factor more general cases. Returns the greatest power of
// two divisor. If overflow happens, the trip count expression is still
// divisible by the greatest power of 2 divisor returned.
- return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
+ return 1U << std::min((uint32_t)31,
+ GetMinTrailingZeros(applyLoopGuards(TCExpr, L)));
ConstantInt *Result = TC->getValue();
@@ -13259,6 +13260,27 @@ class SCEVLoopGuardRewriter : public SCEVRewriteVisitor<SCEVLoopGuardRewriter> {
const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
+ // If we have LHS == 0, check if LHS is computing a property of some unknown
+ // SCEV %v which we can rewrite %v to express explicitly.
+ const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+ if (Predicate == CmpInst::ICMP_EQ && RHSC &&
+ RHSC->getValue()->isNullValue()) {
+ // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+ // explicitly express that.
+ const SCEV *URemLHS = nullptr;
+ const SCEV *URemRHS = nullptr;
+ if (matchURem(LHS, URemLHS, URemRHS)) {
+ if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
+ Value *V = LHSUnknown->getValue();
+ auto Multiple =
+ getMulExpr(getUDivExpr(URemLHS, URemRHS), URemRHS,
+ (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
+ RewriteMap[V] = Multiple;
+ return;
+ }
+ }
+ }
+
if (!isa<SCEVUnknown>(LHS)) {
std::swap(LHS, RHS);
Predicate = CmpInst::getSwappedPredicate(Predicate);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ec36b8292ad3..23209463ac2e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5573,7 +5573,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
const SCEV *Rem = SE->getURemExpr(
- ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+ SE->applyLoopGuards(ExitCount, TheLoop),
+ SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
if (Rem->isZero()) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
index 57049ed9cb20..bbce71501575 100644
--- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -9,7 +9,7 @@ define void @test_trip_multiple_4(i32 %num) {
; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)
; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2
; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)
-; CHECK: Loop %for.body: Trip multiple is 1
+; CHECK: Loop %for.body: Trip multiple is 4
;
entry:
%u = urem i32 %num, 4
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
new file mode 100644
index 000000000000..b7e13931d1ae
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s
+
+; Make sure the loop is unrolled without a remainder loop based on an assumption
+; that the least significant bit is known to be zero.
+
+define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
+; CHECK-LABEL: @assumeDivisibleTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
+; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP1]], 3
+; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
+; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %and = and i32 %p, 1
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %guarded, label %exit
+
+guarded:
+ %rem = urem i32 %q, 2
+ %cmp2 = icmp eq i32 %rem, 0
+ tail call void @llvm.assume(i1 %cmp2)
+ %gt = icmp sgt i32 %p, %q
+ %n = select i1 %gt, i32 %p, i32 %q
+ %cmp110 = icmp sgt i32 %n, 0
+ br i1 %cmp110, label %for.body, label %exit
+
+for.body:
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
+ %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = add i8 %0, 3
+ %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
+ store i8 %add, i8* %arrayidx4, align 1
+ %inc = add nuw nsw i32 %i.011, 1
+ %cmp1 = icmp slt i32 %inc, %n
+ br i1 %cmp1, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+; Make sure the loop is unrolled with a remainder loop when the trip-count
+; is not provably divisible by the unroll factor.
+
+define dso_local void @cannotProveDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
+; CHECK-LABEL: @cannotProveDivisibleTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 6
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK: for.body.preheader.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[N]], [[XTRAITER]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP2]], 3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
+; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP3]], 3
+; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
+; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT: [[NITER_NSUB_1]] = sub i32 [[NITER_NSUB]], 1
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp ne i32 [[NITER_NSUB_1]], 0
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], [[LOOP2:!llvm.loop !.*]]
+; CHECK: exit.loopexit.unr-lcssa.loopexit:
+; CHECK-NEXT: [[I_011_UNR_PH:%.*]] = phi i32 [ [[INC_1]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT_UNR_LCSSA]]
+; CHECK: exit.loopexit.unr-lcssa:
+; CHECK-NEXT: [[I_011_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[I_011_UNR_PH]], [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK: for.body.epil.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_EPIL:%.*]]
+; CHECK: for.body.epil:
+; CHECK-NEXT: [[I_011_EPIL:%.*]] = phi i32 [ [[I_011_UNR]], [[FOR_BODY_EPIL_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011_EPIL]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_EPIL]], align 1
+; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i8 [[TMP4]], 3
+; CHECK-NEXT: [[ARRAYIDX4_EPIL:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011_EPIL]]
+; CHECK-NEXT: store i8 [[ADD_EPIL]], i8* [[ARRAYIDX4_EPIL]], align 1
+; CHECK-NEXT: [[INC_EPIL:%.*]] = add nuw nsw i32 [[I_011_EPIL]], 1
+; CHECK-NEXT: [[CMP1_EPIL:%.*]] = icmp slt i32 [[INC_EPIL]], [[N]]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT_EPILOG_LCSSA:%.*]]
+; CHECK: exit.loopexit.epilog-lcssa:
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %and = and i32 %p, 6
+ %cmp = icmp eq i32 %and, 0
+ br i1 %cmp, label %guarded, label %exit
+
+guarded:
+ %rem = urem i32 %q, 2
+ %cmp2 = icmp eq i32 %rem, 0
+ tail call void @llvm.assume(i1 %cmp2)
+ %gt = icmp sgt i32 %p, %q
+ %n = select i1 %gt, i32 %p, i32 %q
+ %cmp110 = icmp sgt i32 %n, 0
+ br i1 %cmp110, label %for.body, label %exit
+
+for.body:
+ %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
+ %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
+ %0 = load i8, i8* %arrayidx, align 1
+ %add = add i8 %0, 3
+ %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
+ store i8 %add, i8* %arrayidx4, align 1
+ %inc = add nuw nsw i32 %i.011, 1
+ %cmp1 = icmp slt i32 %inc, %n
+ br i1 %cmp1, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
index 1f9cdbad662a..a99279b93938 100644
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -64,13 +64,21 @@ exit:
; Make sure the loop is vectorized under -Os without folding its tail based on
; its trip-count's lower bits assumed to be zero.
-define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32* %p) optsize {
+define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize {
; CHECK-LABEL: @assumeAlignedTC(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[N:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[N]], 3
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 8
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[LOOP_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: loop.preheader:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
@@ -89,32 +97,151 @@ define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32* %p) optsi
; CHECK-NEXT: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
- %n = load i32, i32* %p
- %and = and i32 %n, 3
- %cmp = icmp eq i32 %and, 0
- tail call void @llvm.assume(i1 %cmp)
- br label %loop
+ %and1 = and i32 %p, 3
+ %cmp1 = icmp eq i32 %and1, 0
+ br i1 %cmp1, label %guarded, label %exit
+
+guarded:
+ %rem = urem i32 %q, 8
+ %cmp2 = icmp eq i32 %rem, 0
+ tail call void @llvm.assume(i1 %cmp2)
+ %gt = icmp sgt i32 %p, %q
+ %n = select i1 %gt, i32 %p, i32 %q
+ %cmp110 = icmp sgt i32 %n, 0
+ br i1 %cmp110, label %loop, label %exit
loop:
- %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
+ %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
+ store i32 13, i32* %arrayidx, align 1
+ %rivPlus1 = add nuw nsw i32 %riv, 1
+ %cond = icmp eq i32 %rivPlus1, %n
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Make sure the loop's tail is folded when vectorized under -Os based on its trip-count's
+; not being provably divisible by chosen VF.
+
+define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize {
+; CHECK-LABEL: @cannotProveAlignedTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 3
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[LOOP_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: loop.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: store i32 13, i32* [[TMP3]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK: pred.store.if1:
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP5]]
+; CHECK-NEXT: store i32 13, i32* [[TMP6]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; CHECK: pred.store.continue2:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP8]]
+; CHECK-NEXT: store i32 13, i32* [[TMP9]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP11]]
+; CHECK-NEXT: store i32 13, i32* [[TMP12]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
+; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP7:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %and1 = and i32 %p, 3
+ %cmp1 = icmp eq i32 %and1, 0
+ br i1 %cmp1, label %guarded, label %exit
+
+guarded:
+ %rem = urem i32 %q, 3
+ %cmp2 = icmp eq i32 %rem, 0
+ tail call void @llvm.assume(i1 %cmp2)
+ %gt = icmp sgt i32 %p, %q
+ %n = select i1 %gt, i32 %p, i32 %q
+ %cmp110 = icmp sgt i32 %n, 0
+ br i1 %cmp110, label %loop, label %exit
+
+loop:
+ %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
store i32 13, i32* %arrayidx, align 1
%rivPlus1 = add nuw nsw i32 %riv, 1