[llvm] f39f92c - [ARM][MVE] tail-predication: overflow checks for elementcount, cont'd
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 28 01:22:14 PDT 2020
Author: Sjoerd Meijer
Date: 2020-09-28T09:20:51+01:00
New Revision: f39f92c1f610fcdfad74730a3e3df881e32a28c2
URL: https://github.com/llvm/llvm-project/commit/f39f92c1f610fcdfad74730a3e3df881e32a28c2
DIFF: https://github.com/llvm/llvm-project/commit/f39f92c1f610fcdfad74730a3e3df881e32a28c2.diff
LOG: [ARM][MVE] tail-predication: overflow checks for elementcount, cont'd
This is a reimplementation of the overflow checks for the elementcount,
i.e. the 2nd argument of intrinsic get.active.lane.mask. The element
count is lowered in each iteration of the tail-predicated loop, and
we must prove that this expression doesn't overflow.
Many thanks to Eli Friedman and Sam Parker for all their help with
this work.
Differential Revision: https://reviews.llvm.org/D88086
Added:
Modified:
llvm/lib/Target/ARM/MVETailPredication.cpp
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
Removed:
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll
################################################################################
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 21ba4a7f5db9..794c4949437c 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -373,15 +373,15 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
EnableTailPredication == TailPredication::ForceEnabled;
- // 1) Check that the original scalar loop TripCount (TC) belongs to this loop.
- // The scalar tripcount corresponds the number of elements processed by the
- // loop, so we will refer to that from this point on.
Value *ElemCount = ActiveLaneMask->getOperand(1);
auto *EC= SE->getSCEV(ElemCount);
auto *TC = SE->getSCEV(TripCount);
int VectorWidth = VecTy->getNumElements();
ConstantInt *ConstElemCount = nullptr;
+ // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to
+ // this loop. The scalar tripcount corresponds the number of elements
+ // processed by the loop, so we will refer to that from this point on.
if (!SE->isLoopInvariant(EC, L)) {
LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");
return false;
@@ -405,6 +405,9 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
// counting from 0.
uint64_t TC2 = ConstElemCount->getZExtValue() + 1;
+ // If the tripcount values are inconsistent, we don't want to insert the
+ // VCTP and trigger tail-predication; it's better to keep intrinsic
+ // get.active.lane.mask and legalize this.
if (TC1 != TC2) {
LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
<< TC1 << " from set.loop.iterations, and "
@@ -412,104 +415,59 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
return false;
}
} else if (!ForceTailPredication) {
- // Smoke tests if the element count is a runtime value. I.e., this isn't
- // fully generic because that would require a full SCEV visitor here. It
- // would require extracting the variable from the elementcount SCEV
- // expression, and match this up with the tripcount SCEV expression. If
- // this matches up, we know both expressions are bound by the same
- // variable, and thus we know this tripcount belongs to this loop. The
- // checks below will catch most cases though.
- if (isa<SCEVAddExpr>(EC) || isa<SCEVUnknown>(EC)) {
- // If the element count is a simple AddExpr or SCEVUnknown, which is e.g.
- // the case when the element count is just a variable %N, we can just see
- // if it is an operand in the tripcount scev expression.
- if (isa<SCEVAddExpr>(TC) && !SE->hasOperand(TC, EC)) {
- LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
- return false;
- }
- } else if (const SCEVAddRecExpr *AddRecExpr = dyn_cast<SCEVAddRecExpr>(EC)) {
- // For more complicated AddRecExpr, check that the corresponding loop and
- // its loop hierarhy contains the trip count loop.
- if (!AddRecExpr->getLoop()->contains(L)) {
- LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
- return false;
- }
- } else {
- LLVM_DEBUG(dbgs() << "ARM TP: Unsupported SCEV type, can't verify the "
- "element counter\n");
+ // 2) We need to prove that the sub expression that we create in the
+ // tail-predicated loop body, which calculates the remaining elements to be
+ // processed, is non-negative, i.e. it doesn't overflow:
+ //
+ // ((ElementCount + VectorWidth - 1) / VectorWidth) - TripCount >= 0
+ //
+ // This is true if:
+ //
+ // TripCount == (ElementCount + VectorWidth - 1) / VectorWidth
+ //
+ // which what we will be using here.
+ //
+ auto *VW = SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth));
+ // ElementCount + (VW-1):
+ auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
+ SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+
+ // Ceil = ElementCount + (VW-1) / VW
+ auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1, VW);
+
+ LLVM_DEBUG(
+ dbgs() << "ARM TP: Analysing overflow behaviour for:\n";
+ dbgs() << "ARM TP: - TripCount = "; TC->dump();
+ dbgs() << "ARM TP: - ElemCount = "; EC->dump();
+ dbgs() << "ARM TP: - VecWidth = " << VectorWidth << "\n";
+ dbgs() << "ARM TP: - (ElemCount+VW-1) / VW = "; Ceil->dump();
+ );
+
+ // As an example, almost all the tripcount expressions (produced by the
+ // vectoriser) look like this:
+ //
+ // TC = ((-4 + (4 * ((3 + %N) /u 4))<nuw>) /u 4)
+ //
+ // and "ElementCount + (VW-1) / VW":
+ //
+ // Ceil = ((3 + %N) /u 4)
+ //
+ // Check for equality of TC and Ceil by calculating SCEV expression
+ // TC - Ceil and test it for zero.
+ //
+ bool Zero = SE->getMinusSCEV(
+ SE->getBackedgeTakenCount(L),
+ SE->getUDivExpr(SE->getAddExpr(SE->getMulExpr(Ceil, VW),
+ SE->getNegativeSCEV(VW)),
+ VW))
+ ->isZero();
+
+ if (!Zero) {
+ LLVM_DEBUG(dbgs() << "ARM TP: possible overflow in sub expression.\n");
return false;
}
}
- // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
- //
- // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
- //
- // 2.1) First prove overflow can't happen in:
- //
- // ElementCount + (VectorWidth - 1)
- //
- // Because of a lack of context, it is difficult to get a useful bounds on
- // this expression. But since ElementCount uses the same variables as the
- // TripCount (TC), for which we can find meaningful value ranges, we use that
- // instead and assert that:
- //
- // upperbound(TC) <= UINT_MAX - VectorWidth
- //
- unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
- auto MaxMinusVW = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
- APInt UpperboundTC = SE->getUnsignedRangeMax(TC);
-
- if (UpperboundTC.ugt(MaxMinusVW) && !ForceTailPredication) {
- LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
- dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
- dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
- return false;
- }
-
- // 2.2) Make sure overflow doesn't happen in final expression:
- // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,
- // To do this, compare the full ranges of these subexpressions:
- //
- // Range(Ceil) <= Range(TC)
- //
- // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime
- // values (and not constants), we have to compensate for the lowerbound value
- // range to be off by 1. The reason is that the TC lives in the preheader in
- // this form:
- //
- // %trip.count.minus = add nsw nuw i32 %N, -1
- //
- // For the loop to be executed, %N has to be >= 1 and as a result the value
- // range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
- //
- // %5 = add nuw nsw i32 %4, 1
- // call void @llvm.set.loop.iterations.i32(i32 %5)
- //
- // where %5 is some expression using %N, which needs to have a lower bound of
- // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
- // we first add 0 to TC such that we can do the <= comparison on both sets.
- //
-
- // Tmp = ElementCount + (VW-1)
- auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
- SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
- // Ceil = ElementCount + (VW-1) / VW
- auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
- SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
-
- ConstantRange RangeCeil = SE->getUnsignedRange(Ceil) ;
- ConstantRange RangeTC = SE->getUnsignedRange(TC) ;
- if (!RangeTC.isSingleElement()) {
- auto ZeroRange =
- ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
- RangeTC = RangeTC.unionWith(ZeroRange);
- }
- if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
- LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
- return false;
- }
-
// 3) Find out if IV is an induction phi. Note that we can't use Loop
// helpers here to get the induction variable, because the hardware loop is
// no longer in loopsimplify form, and also the hwloop intrinsic uses a
@@ -518,6 +476,7 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
auto *IV = ActiveLaneMask->getOperand(0);
auto *IVExpr = SE->getSCEV(IV);
auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+
if (!AddExpr) {
LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
return false;
@@ -527,6 +486,11 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
return false;
}
+ auto *Base = dyn_cast<SCEVConstant>(AddExpr->getOperand(0));
+ if (!Base || !Base->isZero()) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction base is not 0\n");
+ return false;
+ }
auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
if (!Step) {
LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
index 22ffa12c93ea..51300a959f5a 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@@ -478,102 +478,63 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: wrong_tripcount_arg
-; CHECK: vector.body:
-; CHECK: call <4 x i1> @llvm.arm.mve.vctp32
-; CHECK-NOT: call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32
-; CHECK: vector.body35:
-; CHECK: call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32
-; CHECK-NOT: call <4 x i1> @llvm.arm.mve.vctp32
+; CHECK-LABEL: tripcount_arg_not_invariant
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
; CHECK: ret void
;
-define dso_local void @wrong_tripcount_arg(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture %D, i32 %N1, i32 %N2) local_unnamed_addr #0 {
+define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
- %cmp29 = icmp sgt i32 %N1, 0
- %0 = add i32 %N1, 3
+ %cmp8 = icmp sgt i32 %N, 0
+ %0 = add i32 %N, 3
%1 = lshr i32 %0, 2
%2 = shl nuw i32 %1, 2
%3 = add i32 %2, -4
%4 = lshr i32 %3, 2
%5 = add nuw nsw i32 %4, 1
- br i1 %cmp29, label %vector.ph, label %for.cond4.preheader
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
+ %trip.count.minus.1 = add i32 %N, -1
call void @llvm.set.loop.iterations.i32(i32 %5)
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
- %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %D, %vector.ph ]
- %lsr.iv59 = phi i32* [ %scevgep60, %vector.body ], [ %C, %vector.ph ]
- %lsr.iv56 = phi i32* [ %scevgep57, %vector.body ], [ %B, %vector.ph ]
+ %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+ %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+ %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
- %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
- %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*
- %lsr.iv6264 = bitcast i32* %lsr.iv62 to <4 x i32>*
- %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N1)
- %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5658, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
- %wide.masked.load32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5961, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
- %7 = add nsw <4 x i32> %wide.masked.load32, %wide.masked.load
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv6264, i32 4, <4 x i1> %active.lane.mask)
+
+ %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+ %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+ %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
+
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+ %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+ %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
%index.next = add i32 %index, 4
- %scevgep57 = getelementptr i32, i32* %lsr.iv56, i32 4
- %scevgep60 = getelementptr i32, i32* %lsr.iv59, i32 4
- %scevgep63 = getelementptr i32, i32* %lsr.iv62, i32 4
+ %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+ %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+ %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
%8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
%9 = icmp ne i32 %8, 0
- br i1 %9, label %vector.body, label %for.cond4.preheader
-
-for.cond4.preheader: ; preds = %vector.body, %entry
- %cmp527 = icmp sgt i32 %N2, 0
- %10 = add i32 %N2, 3
- %11 = lshr i32 %10, 2
- %12 = shl nuw i32 %11, 2
- %13 = add i32 %12, -4
- %14 = lshr i32 %13, 2
- %15 = add nuw nsw i32 %14, 1
- br i1 %cmp527, label %vector.ph36, label %for.cond.cleanup6
-
-vector.ph36: ; preds = %for.cond4.preheader
- call void @llvm.set.loop.iterations.i32(i32 %15)
- br label %vector.body35
-
-vector.body35: ; preds = %vector.body35, %vector.ph36
- %lsr.iv53 = phi i32* [ %scevgep54, %vector.body35 ], [ %A, %vector.ph36 ]
- %lsr.iv50 = phi i32* [ %scevgep51, %vector.body35 ], [ %C, %vector.ph36 ]
- %lsr.iv = phi i32* [ %scevgep, %vector.body35 ], [ %B, %vector.ph36 ]
- %index40 = phi i32 [ 0, %vector.ph36 ], [ %index.next41, %vector.body35 ]
- %16 = phi i32 [ %15, %vector.ph36 ], [ %18, %vector.body35 ]
- %lsr.iv49 = bitcast i32* %lsr.iv to <4 x i32>*
- %lsr.iv5052 = bitcast i32* %lsr.iv50 to <4 x i32>*
- %lsr.iv5355 = bitcast i32* %lsr.iv53 to <4 x i32>*
-
-; This has N1 as the tripcount / element count, which is the tripcount of the
-; first loop and not this one:
- %active.lane.mask46 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index40, i32 %N1)
-
- %wide.masked.load47 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv49, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef)
- %wide.masked.load48 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5052, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef)
- %17 = add nsw <4 x i32> %wide.masked.load48, %wide.masked.load47
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %17, <4 x i32>* %lsr.iv5355, i32 4, <4 x i1> %active.lane.mask46)
- %index.next41 = add i32 %index40, 4
- %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
- %scevgep51 = getelementptr i32, i32* %lsr.iv50, i32 4
- %scevgep54 = getelementptr i32, i32* %lsr.iv53, i32 4
- %18 = call i32 @llvm.loop.decrement.reg.i32(i32 %16, i32 1)
- %19 = icmp ne i32 %18, 0
- br i1 %19, label %vector.body35, label %for.cond.cleanup6
+ ;br i1 %9, label %vector.body, label %for.cond.cleanup
+ br i1 %9, label %vector.body, label %vector.ph
-for.cond.cleanup6: ; preds = %vector.body35, %for.cond4.preheader
+for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
-; CHECK-LABEL: tripcount_arg_not_invariant
+; CHECK-LABEL: addrec_base_not_zero
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
-define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+define dso_local void @addrec_base_not_zero(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
%cmp8 = icmp sgt i32 %N, 0
%0 = add i32 %N, 3
@@ -593,15 +554,15 @@ vector.body: ; preds = %vector.body, %vecto
%lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
%lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
%lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
- %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
- %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+; AddRec base is not 0:
+ %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]
+
+ %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
%lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
%lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
%lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
-
- %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
-
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
%7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
@@ -619,6 +580,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
+
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
index 8bf15aba9d97..58f3a94b061f 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -375,7 +375,7 @@ vector.body:
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-; The induction variable %D is not an IV:
+; The induction variable %N is not an IV:
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll
deleted file mode 100644
index e2fa8ea77071..000000000000
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s --check-prefixes=CHECK,ENABLED
-; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=force-enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s --check-prefixes=CHECK,FORCED
-
-; CHECK-LABEL: set_iterations_not_rounded_up
-;
-; ENABLED: call <4 x i1> @llvm.get.active.lane.mask
-; ENABLED-NOT: vctp
-;
-; FORCED-NOT: call <4 x i1> @llvm.get.active.lane.mask
-; FORCED: vctp
-;
-; CHECK: ret void
-;
-define dso_local void @set_iterations_not_rounded_up(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
-entry:
- %cmp8 = icmp sgt i32 %N, 0
-
-; Here, v5 which is used in set.loop.iterations which is usually rounded up to
-; a next multiple of the VF when emitted from the vectoriser, which means a
-; bound can be put on this expression. Without this, we can't, and should flag
-; this as potentially overflow behaviour.
-
- %v5 = add nuw nsw i32 %N, 1
- br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
-
-vector.ph: ; preds = %entry
- %trip.count.minus.1 = add i32 %N, -1
- call void @llvm.set.loop.iterations.i32(i32 %v5)
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %vector.ph
- %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
- %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
- %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
- %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
- %v6 = phi i32 [ %v5, %vector.ph ], [ %v8, %vector.body ]
- %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
- %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
- %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
- %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
- %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
- %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
- %v7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
- call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
- %index.next = add i32 %index, 4
- %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
- %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
- %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
- %v8 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
- %v9 = icmp ne i32 %v8, 0
- br i1 %v9, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %vector.body, %entry
- ret void
-}
-
-declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
-declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
-declare void @llvm.set.loop.iterations.i32(i32)
-declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
-declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
More information about the llvm-commits mailing list