[llvm] 2b69efd - [ARM][LV] Add a preferPredicatedReductionSelect target hook
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 21 00:48:27 PDT 2020
Author: David Green
Date: 2020-08-21T08:48:12+01:00
New Revision: 2b69efded0dc95ffbbdb39d109d38fbb8f0d6390
URL: https://github.com/llvm/llvm-project/commit/2b69efded0dc95ffbbdb39d109d38fbb8f0d6390
DIFF: https://github.com/llvm/llvm-project/commit/2b69efded0dc95ffbbdb39d109d38fbb8f0d6390.diff
LOG: [ARM][LV] Add a preferPredicatedReductionSelect target hook
As part of D84741, this adds a target hook for the
preferPredicatedReductionSelect option and makes use
of it under MVE, allowing us to tail-predicate most
reduction loops.
Differential Revision: https://reviews.llvm.org/D85980
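For illustration (this example is not part of the patch; sum is a hypothetical
name), the kind of loop this change lets MVE tail-predicate is a plain
reduction such as:

  // A simple sum reduction. With this patch, loops like this (and the
  // mul/and/or/xor/fadd/fmul variants exercised in the tests below) can be
  // tail-folded: the vector body runs under an active-lane mask and the
  // masked-off lanes are handled by a predicated select inside the loop.
  int sum(const int *A, int N) {
    int S = 0;
    for (int i = 0; i < N; ++i)
      S += A[i];
    return S;
  }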
Added:
Modified:
llvm/include/llvm/Analysis/TargetTransformInfo.h
llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
llvm/lib/Analysis/TargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/lib/Target/ARM/ARMTargetTransformInfo.h
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 092c9744a21bc..06d354411af69 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1283,6 +1283,20 @@ class TargetTransformInfo {
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags Flags) const;
+ /// \returns True if the target prefers the reduction select to be kept in
+ /// the loop when tail folding, i.e.:
+ /// loop:
+ ///   p = phi (0, s)
+ ///   a = add (p, x)
+ ///   s = select (mask, a, p)
+ /// vecreduce.add(s)
+ ///
+ /// This is as opposed to the normal scheme of p = phi (0, a), which allows
+ /// the select to be pulled out of the loop. If the select(.., add, ..) can be
+ /// predicated by the target, this can lead to cleaner code generation.
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ ReductionFlags Flags) const;
+
/// \returns True if the target wants to expand the given reduction intrinsic
/// into a shuffle sequence.
bool shouldExpandReduction(const IntrinsicInst *II) const;
@@ -1573,6 +1587,8 @@ class TargetTransformInfo::Concept {
VectorType *VecTy) const = 0;
virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
ReductionFlags) const = 0;
+ virtual bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ ReductionFlags) const = 0;
virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
virtual unsigned getGISelRematGlobalCost() const = 0;
virtual bool hasActiveVectorLength() const = 0;
@@ -2073,6 +2089,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
ReductionFlags Flags) const override {
return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
}
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ ReductionFlags Flags) const override {
+ return Impl.preferPredicatedReductionSelect(Opcode, Ty, Flags);
+ }
bool shouldExpandReduction(const IntrinsicInst *II) const override {
return Impl.shouldExpandReduction(II);
}
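As a rough scalar analogue of the scheme described in the new comment above (a
sketch only; the vectorizer emits this with vector phis and an active-lane
mask, and maskedSum is a hypothetical name):

  #include <cstdint>

  // "Select kept in the loop": each iteration either accumulates or keeps
  // the previous value, matching p = phi (0, s); a = add (p, x);
  // s = select (mask, a, p) from the comment above. The vector form then
  // feeds s into vecreduce.add after the loop.
  int32_t maskedSum(const int32_t *X, const bool *Mask, int N) {
    int32_t P = 0;
    for (int I = 0; I < N; ++I) {
      int32_t A = P + X[I];  // a = add (p, x)
      P = Mask[I] ? A : P;   // s = select (mask, a, p), feeding the phi
    }
    return P;
  }

Keeping the select inside the loop costs an extra operation per iteration in
general, but when the target can predicate the add directly (as MVE can), the
select folds away entirely.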
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4dc0a90cc9db1..ee1527ff88819 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -658,6 +658,11 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return false;
+ }
+
bool shouldExpandReduction(const IntrinsicInst *II) const { return true; }
unsigned getGISelRematGlobalCost() const { return 1; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 944e621fc0701..52c88180c9ec5 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1013,6 +1013,11 @@ bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty,
return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
}
+bool TargetTransformInfo::preferPredicatedReductionSelect(
+ unsigned Opcode, Type *Ty, ReductionFlags Flags) const {
+ return TTIImpl->preferPredicatedReductionSelect(Opcode, Ty, Flags);
+}
+
bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const {
return TTIImpl->shouldExpandReduction(II);
}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 85ed3cd153bf8..1a4b7c53a60aa 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1589,35 +1589,28 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const LoopAccessInfo *LAI) {
LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
- // If there are live-out values, it is probably a reduction, which needs a
- // final reduction step after the loop. MVE has a VADDV instruction to reduce
- // integer vectors, but doesn't have an equivalent one for float vectors. A
- // live-out value that is not recognised as a reduction will result in the
- // tail-predicated loop to be reverted to a non-predicated loop and this is
- // very expensive, i.e. it has a significant performance impact. So, in this
- // case it's better not to tail-predicate the loop, which is what we check
- // here. Thus, we allow only 1 live-out value, which has to be an integer
- // reduction, which matches the loops supported by ARMLowOverheadLoops.
- // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in
- // sync with each other.
+ // If there are live-out values, it is probably a reduction. We can predicate
+ // most reduction operations freely under MVE using a combination of
+ // prefer-predicated-reduction-select and inloop reductions. We limit this to
+ // floating-point and integer reductions, but don't check for operators
+ // specifically here. If the value ends up not being a reduction (and so the
+ // vectorizer cannot tail-fold the loop), we should fall back to standard
+ // vectorization automatically.
SmallVector<Instruction *, 8> LiveOuts;
LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
- bool IntReductionsDisabled =
+ bool ReductionsDisabled =
EnableTailPredication == TailPredication::EnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabledNoReductions;
for (auto *I : LiveOuts) {
- if (!I->getType()->isIntegerTy()) {
- LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer "
+ if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
+ !I->getType()->isHalfTy()) {
+ LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
"live-out value\n");
return false;
}
- if (I->getOpcode() != Instruction::Add) {
- LLVM_DEBUG(dbgs() << "Only add reductions supported\n");
- return false;
- }
- if (IntReductionsDisabled) {
- LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n");
+ if (ReductionsDisabled) {
+ LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
return false;
}
}
@@ -1811,3 +1804,10 @@ bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
return ST->hasMVEIntegerOps();
}
+
+bool ARMTTIImpl::preferPredicatedReductionSelect(
+ unsigned Opcode, Type *Ty, TTI::ReductionFlags Flags) const {
+ if (!ST->hasMVEIntegerOps())
+ return false;
+ return true;
+}
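To make the relaxed live-out check concrete (a sketch mirroring the
f32_reduction test added below, not code from the patch; sumF32 is a
hypothetical name), a floating-point reduction such as the following is now
eligible, since float and half live-outs are accepted alongside integers:

  // Float sum reduction: the live-out is a float, which
  // canTailPredicateLoop now allows. With reassociation enabled (matching
  // the fadd fast in the tests), this vectorizes as a masked fadd
  // reduction under MVE.
  float sumF32(const float *Input, int N) {
    float Sum = 0.0f;
    for (int I = 0; I < N; ++I)
      Sum += Input[I];
    return Sum;
  }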
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 6b24f8b09fe95..cc2019b47a076 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -186,6 +186,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const;
+
bool shouldExpandReduction(const IntrinsicInst *II) const {
switch (II->getIntrinsicID()) {
case Intrinsic::experimental_vector_reduce_v2_fadd:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 21d650ca751a5..86f15500d8389 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3929,7 +3929,10 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// and so use the select value for the phi instead of the old
// LoopExitValue.
RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
- if (PreferPredicatedReductionSelect) {
+ if (PreferPredicatedReductionSelect ||
+ TTI->preferPredicatedReductionSelect(
+ RdxDesc.getRecurrenceBinOp(RdxDesc.getRecurrenceKind()),
+ Phi->getType(), TargetTransformInfo::ReductionFlags())) {
auto *VecRdxPhi = cast<PHINode>(getOrCreateVectorValue(Phi, Part));
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index da5b5a60a400c..cf7bf00d2131f 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -12,18 +12,18 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
-; CHECK-NEXT: [[TMP2]] = add <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[TMP2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -60,7 +60,7 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
@@ -70,14 +70,14 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[VEC_PHI]], [[VEC_IND]]
; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP6]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP7]])
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
@@ -117,39 +117,29 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = mul <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !6
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: br i1 false, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY]] ]
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[PROD_02:%.*]] = phi i32 [ [[L9:%.*]], [[DOTLR_PH]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i32, i32* [[L2]], align 4
-; CHECK-NEXT: [[L4:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L5:%.*]] = load i32, i32* [[L4]], align 4
-; CHECK-NEXT: [[L8:%.*]] = mul i32 [[PROD_02]], [[L3]]
-; CHECK-NEXT: [[L9]] = mul i32 [[L8]], [[L5]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
+; CHECK-NEXT: br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop !7
; CHECK: ._crit_edge:
-; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[DOTLR_PH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[PROD_0_LCSSA]]
;
entry:
@@ -181,39 +171,29 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = and <4 x i32> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[TMP4]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !8
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ -1, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = and i32 [[RESULT_08]], [[L0]]
-; CHECK-NEXT: [[AND]] = and i32 [[ADD]], [[L1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !9
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
@@ -245,39 +225,29 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = or <4 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6]] = or <4 x i32> [[VEC_PHI]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !10
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT: [[OR]] = or i32 [[ADD]], [[RESULT_08]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !11
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
@@ -309,39 +279,29 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = xor <4 x i32> [[TMP4]], [[VEC_PHI]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6]] = xor <4 x i32> [[VEC_PHI]], [[TMP5]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !12
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi i32 [ [[XOR:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L1]], [[L0]]
-; CHECK-NEXT: [[XOR]] = xor i32 [[ADD]], [[RESULT_08]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !13
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RESULT_0_LCSSA]]
;
entry:
@@ -373,39 +333,29 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = fadd fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !14
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float 0.000000e+00, <4 x float> [[TMP6]])
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RESULT_08]], [[L0]]
-; CHECK-NEXT: [[FADD]] = fadd fast float [[ADD]], [[L1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !15
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
;
entry:
@@ -437,39 +387,29 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 256)
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[TMP4]], [[WIDE_LOAD1]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> undef)
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP4]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !16
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.experimental.vector.reduce.v2.fmul.f32.v4f32(float 1.000000e+00, <4 x float> [[TMP6]])
+; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ 0.000000e+00, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_08:%.*]] = phi float [ [[FMUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fmul fast float [[RESULT_08]], [[L0]]
-; CHECK-NEXT: [[FMUL]] = fmul fast float [[ADD]], [[L1]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
+; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !17
; CHECK: for.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ [[FMUL]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[RESULT_0_LCSSA]]
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
index 251c2175bb8b5..ed12afca9b8d5 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll
@@ -34,3 +34,212 @@ for.body: ; preds = %for.body.preheader,
%exitcond.not = icmp eq i32 %inc, %N
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
+
+define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) {
+; CHECK-LABEL: f32_reduction(
+; CHECK: vector.body:
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+ %sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
+ %Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1
+ %0 = load float, float* %Input.addr.07, align 4
+ %add = fadd fast float %0, %sum.08
+ %dec = add i32 %blkCnt.09, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ %add.lcssa = phi float [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ %conv = uitofp i32 %N to float
+ %div = fdiv fast float %sum.0.lcssa, %conv
+ store float %div, float* %Output, align 4
+ ret void
+}
+
+define dso_local void @f16_reduction(half* nocapture readonly %Input, i32 %N, half* nocapture %Output) {
+; CHECK-LABEL: f16_reduction(
+; CHECK: vector.body:
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+ %cmp6 = icmp eq i32 %N, 0
+ br i1 %cmp6, label %while.end, label %while.body.preheader
+
+while.body.preheader: ; preds = %entry
+ br label %while.body
+
+while.body: ; preds = %while.body.preheader, %while.body
+ %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+ %sum.08 = phi half [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
+ %Input.addr.07 = phi half* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds half, half* %Input.addr.07, i32 1
+ %0 = load half, half* %Input.addr.07, align 2
+ %add = fadd fast half %0, %sum.08
+ %dec = add i32 %blkCnt.09, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit: ; preds = %while.body
+ %add.lcssa = phi half [ %add, %while.body ]
+ br label %while.end
+
+while.end: ; preds = %while.end.loopexit, %entry
+ %sum.0.lcssa = phi half [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ %conv = uitofp i32 %N to half
+ %div = fdiv fast half %sum.0.lcssa, %conv
+ store half %div, half* %Output, align 2
+ ret void
+}
+
+define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) {
+; CHECK-LABEL: mixed_f32_i32_reduction(
+; CHECK: vector.body:
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+ %cmp15 = icmp eq i32 %N, 0
+ br i1 %cmp15, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+ br label %while.body
+
+while.body:
+ %blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
+ %isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ]
+ %fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
+ %fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ]
+ %iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ]
+ %incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1
+ %incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1
+ %0 = load i32, i32* %iInput.addr.016, align 4
+ %add2 = add nsw i32 %0, %isum.019
+ %1 = load float, float* %fInput.addr.017, align 4
+ %add = fadd fast float %1, %fsum.018
+ %dec = add i32 %blkCnt.020, -1
+ %cmp = icmp eq i32 %dec, 0
+ br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+ %add.lcssa = phi float [ %add, %while.body ]
+ %add2.lcssa = phi i32 [ %add2, %while.body ]
+ %phitmp = sitofp i32 %add2.lcssa to float
+ br label %while.end
+
+while.end:
+ %fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
+ %isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ]
+ %conv = uitofp i32 %N to float
+ %div = fdiv fast float %fsum.0.lcssa, %conv
+ store float %div, float* %fOutput, align 4
+ %div5 = fdiv fast float %isum.0.lcssa, %conv
+ %conv6 = fptosi float %div5 to i32
+ store i32 %conv6, i32* %iOutput, align 4
+ ret void
+}
+
+define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) {
+; CHECK-LABEL: i32_mul_reduction(
+; CHECK: vector.body:
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+ br label %for.body
+
+for.cond.cleanup.loopexit:
+ %mul.lcssa = phi i32 [ %mul, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup:
+ %S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %S.0.lcssa
+
+for.body:
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
+ %0 = load i32, i32* %arrayidx, align 4
+ %mul = mul nsw i32 %0, %S.07
+ %inc = add nuw nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) {
+; CHECK-LABEL: i32_or_reduction(
+; CHECK: vector.body:
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+ %cmp6 = icmp sgt i32 %N, 0
+ br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %or.lcssa = phi i32 [ %or, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %S.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
+ %0 = load i32, i32* %arrayidx, align 4
+ %or = or i32 %0, %S.07
+ %inc = add nuw nsw i32 %i.08, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) {
+; CHECK-LABEL: i32_and_reduction(
+; CHECK: vector.body:
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+entry:
+ %cmp5 = icmp sgt i32 %N, 0
+ br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %and.lcssa = phi i32 [ %and, %for.body ]
+ br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %S.addr.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
+ %0 = load i32, i32* %arrayidx, align 4
+ %and = and i32 %0, %S.addr.06
+ %inc = add nuw nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, %N
+ br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
index e86f101854413..baedc0a23daa2 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll
@@ -266,189 +266,6 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}
-; Don't tail-fold float reductions.
-;
-define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) local_unnamed_addr #0 {
-; CHECK-LABEL: f32_reduction(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.masked.load
-; CHECK-NOT: @llvm.masked.store
-; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
-entry:
- %cmp6 = icmp eq i32 %N, 0
- br i1 %cmp6, label %while.end, label %while.body.preheader
-
-while.body.preheader: ; preds = %entry
- br label %while.body
-
-while.body: ; preds = %while.body.preheader, %while.body
- %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
- %sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
- %Input.addr.07 = phi float* [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
- %incdec.ptr = getelementptr inbounds float, float* %Input.addr.07, i32 1
- %0 = load float, float* %Input.addr.07, align 4
- %add = fadd fast float %0, %sum.08
- %dec = add i32 %blkCnt.09, -1
- %cmp = icmp eq i32 %dec, 0
- br i1 %cmp, label %while.end.loopexit, label %while.body
-
-while.end.loopexit: ; preds = %while.body
- %add.lcssa = phi float [ %add, %while.body ]
- br label %while.end
-
-while.end: ; preds = %while.end.loopexit, %entry
- %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
- %conv = uitofp i32 %N to float
- %div = fdiv fast float %sum.0.lcssa, %conv
- store float %div, float* %Output, align 4
- ret void
-}
-
-; Don't tail-fold float reductions.
-;
-define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) local_unnamed_addr #0 {
-; CHECK-LABEL: mixed_f32_i32_reduction(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.masked.load
-; CHECK-NOT: @llvm.masked.store
-; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
-entry:
- %cmp15 = icmp eq i32 %N, 0
- br i1 %cmp15, label %while.end, label %while.body.preheader
-
-while.body.preheader:
- br label %while.body
-
-while.body:
- %blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
- %isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ]
- %fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
- %fInput.addr.017 = phi float* [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ]
- %iInput.addr.016 = phi i32* [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ]
- %incdec.ptr = getelementptr inbounds float, float* %fInput.addr.017, i32 1
- %incdec.ptr1 = getelementptr inbounds i32, i32* %iInput.addr.016, i32 1
- %0 = load i32, i32* %iInput.addr.016, align 4
- %add2 = add nsw i32 %0, %isum.019
- %1 = load float, float* %fInput.addr.017, align 4
- %add = fadd fast float %1, %fsum.018
- %dec = add i32 %blkCnt.020, -1
- %cmp = icmp eq i32 %dec, 0
- br i1 %cmp, label %while.end.loopexit, label %while.body
-
-while.end.loopexit:
- %add.lcssa = phi float [ %add, %while.body ]
- %add2.lcssa = phi i32 [ %add2, %while.body ]
- %phitmp = sitofp i32 %add2.lcssa to float
- br label %while.end
-
-while.end:
- %fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
- %isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ]
- %conv = uitofp i32 %N to float
- %div = fdiv fast float %fsum.0.lcssa, %conv
- store float %div, float* %fOutput, align 4
- %div5 = fdiv fast float %isum.0.lcssa, %conv
- %conv6 = fptosi float %div5 to i32
- store i32 %conv6, i32* %iOutput, align 4
- ret void
-}
-
-define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
-; CHECK-LABEL: i32_mul_reduction(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.masked.load
-; CHECK-NOT: @llvm.masked.store
-; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
-entry:
- %cmp6 = icmp sgt i32 %N, 0
- br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:
- br label %for.body
-
-for.cond.cleanup.loopexit:
- %mul.lcssa = phi i32 [ %mul, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup:
- %S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %S.0.lcssa
-
-for.body:
- %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
- %0 = load i32, i32* %arrayidx, align 4
- %mul = mul nsw i32 %0, %S.07
- %inc = add nuw nsw i32 %i.08, 1
- %exitcond = icmp eq i32 %inc, %N
- br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) local_unnamed_addr #0 {
-; CHECK-LABEL: i32_or_reduction(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.masked.load
-; CHECK-NOT: @llvm.masked.store
-; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
-entry:
- %cmp6 = icmp sgt i32 %N, 0
- br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %or.lcssa = phi i32 [ %or, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %S.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.08
- %0 = load i32, i32* %arrayidx, align 4
- %or = or i32 %0, %S.07
- %inc = add nuw nsw i32 %i.08, 1
- %exitcond = icmp eq i32 %inc, %N
- br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
-
-define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) local_unnamed_addr #0 {
-; CHECK-LABEL: i32_and_reduction(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.masked.load
-; CHECK-NOT: @llvm.masked.store
-; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
-entry:
- %cmp5 = icmp sgt i32 %N, 0
- br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- %and.lcssa = phi i32 [ %and, %for.body ]
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- %S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ]
- ret i32 %S.addr.0.lcssa
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
- %0 = load i32, i32* %arrayidx, align 4
- %and = and i32 %0, %S.addr.06
- %inc = add nuw nsw i32 %i.07, 1
- %exitcond = icmp eq i32 %inc, %N
- br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
-
define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 {
; CHECK-LABEL: i32_smin_reduction(
; CHECK: vector.body: