[llvm] 1319d9b - [ARM] Don't revert get.active.lane.mask in ARM Tail-Predication pass
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 25 23:45:31 PDT 2020
Author: Sjoerd Meijer
Date: 2020-06-26T07:42:39+01:00
New Revision: 1319d9bb84fa191aba58c4064958fdeb1f29192d
URL: https://github.com/llvm/llvm-project/commit/1319d9bb84fa191aba58c4064958fdeb1f29192d
DIFF: https://github.com/llvm/llvm-project/commit/1319d9bb84fa191aba58c4064958fdeb1f29192d.diff
LOG: [ARM] Don't revert get.active.lane.mask in ARM Tail-Predication pass
Don't revert the get.active.lane.mask intrinsic here; that lowering is moved
to isel legalization in D82292.
Differential Revision: https://reviews.llvm.org/D82105
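
For context, "reverting" the intrinsic (the RevertActiveLaneMask code deleted
below) means expanding @llvm.get.active.lane.mask into a plain vector compare
when no VCTP can be formed. A minimal IR sketch of that expansion for a 4-lane
mask, using placeholder value names %index, %btc and %mask that are not part
of this patch:

    %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
    %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
    %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
    %btc.insert = insertelement <4 x i32> undef, i32 %btc, i32 0
    %btc.splat = shufflevector <4 x i32> %btc.insert, <4 x i32> undef, <4 x i32> zeroinitializer
    %mask = icmp ule <4 x i32> %induction, %btc.splat

Lane i of %mask is active while %index + i is ULE the backedge-taken count
%btc, matching the semantics of the intrinsic. After this patch the pass
leaves the intrinsic in place, and the equivalent expansion happens during
isel legalization instead.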
Added:
Modified:
llvm/lib/Target/ARM/MVETailPredication.cpp
llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 7af202264348..79713bd5cec5 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -86,8 +86,6 @@ class MVETailPredication : public LoopPass {
TargetTransformInfo *TTI = nullptr;
TargetLibraryInfo *TLI = nullptr;
bool ClonedVCTPInExitBlock = false;
- IntrinsicInst *ActiveLaneMask = nullptr;
- FixedVectorType *VecTy = nullptr;
public:
static char ID;
@@ -119,7 +117,8 @@ class MVETailPredication : public LoopPass {
/// intrinsic: check if the first is a loop induction variable, and for
/// the second check that no overflow can occur in the expression that
/// uses this backedge-taken count.
- bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);
+ bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+ FixedVectorType *VecTy);
/// Insert the intrinsic to represent the effect of tail predication.
void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
@@ -130,10 +129,6 @@ class MVETailPredication : public LoopPass {
/// ARMLowOverheadLoops to better optimise away loop update statements inside
/// hardware-loops.
void RematerializeIterCount();
-
- /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs
- /// to be lowered to an icmp.
- void RevertActiveLaneMask();
};
} // end namespace
@@ -167,83 +162,6 @@ void MVETailPredication::RematerializeIterCount() {
DeadInsts);
}
-void MVETailPredication::RevertActiveLaneMask() {
- if (!ActiveLaneMask)
- return;
-
- int VectorWidth = VecTy->getElementCount().Min;
- IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());
-
- // 1. Create the vector induction step. This %induction will be the LHS of
- // the icmp:
- //
- // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
- // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0
- // %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
- //
- Value *Index = ActiveLaneMask->getOperand(0);
- Value *SplatIndex =
- Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");
-
- SmallVector<Constant *, 8> Indices;
- for (int i = 0; i < VectorWidth; ++i)
- Indices.push_back(ConstantInt::get(Index->getType(), i));
-
- Constant *CV = ConstantVector::get(Indices);
- Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");
-
- LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
- dbgs() << "ARM TP: New Induction: " << *Induction << "\n");
-
- // 2. In the Preheader, first look if the splat BTC already exists. Find this
- // %splat, which will be the RHS of the icmp:
- //
- // %TC.minus.1 = add i32 %N, -1
- // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
- // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0
- //
- auto *Preheader = L->getLoopPreheader();
- auto *BTC = ActiveLaneMask->getOperand(1);
- Value *SplatBTC = nullptr;
-
- if (auto *C = dyn_cast<ConstantInt>(BTC)) {
- Builder.SetInsertPoint(Preheader->getTerminator());
- SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
- LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
- } else {
- Instruction *InsertElem;
- for (auto &V : *Preheader) {
- InsertElem = dyn_cast<InsertElementInst>(&V);
- if (!InsertElem)
- continue;
- ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
- if (!CI)
- continue;
- if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
- continue;
- if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
- break;
- }
- }
- // Or create the splat BTC if it doesn't exist.
- if (!SplatBTC) {
- Builder.SetInsertPoint(Preheader->getTerminator());
- Value *Undef =
- UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
- Value *Insert = Builder.CreateInsertElement(Undef,
- BTC, Builder.getInt32(0), "insert.btc");
- Value *Zero = ConstantInt::get(Insert->getType(), 0);
- SplatBTC = Builder.CreateShuffleVector (Insert, Undef, Zero, "splat.btc");
- LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
- }
-
- Builder.SetInsertPoint(ActiveLaneMask);
- Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
- LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
- ActiveLaneMask->replaceAllUsesWith(ICmp);
- ActiveLaneMask->eraseFromParent();
-}
-
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (skipLoop(L) || DisableTailPredication)
return false;
@@ -261,7 +179,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
DL = &L->getHeader()->getModule()->getDataLayout();
this->L = L;
- ActiveLaneMask = nullptr;
// The MVE and LOB extensions are combined to enable tail-predication, but
// there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -318,15 +235,14 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
- if (TryConvert(Setup->getArgOperand(0))) {
- if (ClonedVCTPInExitBlock)
- RematerializeIterCount();
- return true;
- } else
- RevertActiveLaneMask();
+ if (!TryConvert(Setup->getArgOperand(0))) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
+ return false;
+ }
- LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
- return false;
+ if (ClonedVCTPInExitBlock)
+ RematerializeIterCount();
+ return true;
}
static FixedVectorType *getVectorType(IntrinsicInst *I) {
@@ -341,10 +257,27 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
// Check that the loop contains at least one masked load/store intrinsic.
// We only support 'normal' vector instructions - other than masked
// load/stores.
+ bool ActiveLaneMask = false;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ auto *Int = dyn_cast<IntrinsicInst>(&I);
+ if (!Int)
+ continue;
+
+ switch (Int->getIntrinsicID()) {
+ case Intrinsic::get_active_lane_mask:
+ ActiveLaneMask = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::fma:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::uadd_sat:
+ continue;
+ default:
+ break;
+ }
+
if (IsMasked(&I)) {
- FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+ auto *VecTy = getVectorType(Int);
unsigned Lanes = VecTy->getNumElements();
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
@@ -353,23 +286,20 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
- } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
- switch (Int->getIntrinsicID()) {
- case Intrinsic::fma:
- case Intrinsic::sadd_sat:
- case Intrinsic::uadd_sat:
- continue;
- default:
- break;
- }
- for (auto &U : Int->args()) {
- if (isa<VectorType>(U->getType()))
- return false;
- }
+ continue;
+ }
+
+ for (const Use &U : Int->args()) {
+ if (isa<VectorType>(U->getType()))
+ return false;
}
}
}
+ if (!ActiveLaneMask) {
+ LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
+ return false;
+ }
return !MaskedInsts.empty();
}
@@ -451,14 +381,15 @@ static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
// 3) The IV must be an induction phi with an increment equal to the
// vector width.
-bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
- FixedVectorType *VecTy) {
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+ Value *TripCount, FixedVectorType *VecTy) {
// 1) Test whether entry to the loop is protected by a conditional
// BTC + 1 < 0. In other words, if the scalar trip count overflows
// and becomes negative, we shouldn't enter the loop and creating the
// tripcount expression BTC + 1 is not safe. So, check that BTC
// isn't max. This is evaluated in unsigned, because the semantics
// of @get.active.lane.mask is a ULE comparison.
+
int VectorWidth = VecTy->getNumElements();
auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
auto *BTC = SE->getSCEV(BackedgeTakenCount);
@@ -570,8 +501,8 @@ bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
if (VectorWidth == StepValue)
return true;
- LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
- "vector width : " << VectorWidth << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+ "vector width " << VectorWidth << "\n");
return false;
}
@@ -614,6 +545,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
+ unsigned VectorWidth = VecTy->getNumElements();
// The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
// is one less than the trip count. So we need to find or create
@@ -631,10 +563,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
// represent the effect of tail predication.
Builder.SetInsertPoint(ActiveLaneMask);
ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+ ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
Intrinsic::ID VCTPID;
- switch (VecTy->getNumElements()) {
+ switch (VectorWidth) {
default:
llvm_unreachable("unexpected number of lanes");
case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -680,7 +612,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
if (!Predicate || Predicates.count(Predicate))
continue;
- ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+ auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
if (!ActiveLaneMask ||
ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
continue;
@@ -689,8 +621,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
<< *ActiveLaneMask << "\n");
- VecTy = getVectorType(I);
- if (!IsSafeActiveMask(TripCount, VecTy)) {
+ auto *VecTy = getVectorType(I);
+ if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
return false;
}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
index 54ddf6468336..a00af0d6a9ec 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -49,7 +49,7 @@ vector.body: ; preds = %vector.body, %vecto
%tmp7 = bitcast i8* %tmp6 to <16 x i8>*
tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
%index.next = add i32 %index, 16
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+ %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
@@ -106,7 +106,7 @@ vector.body: ; preds = %vector.body, %vecto
%tmp7 = bitcast i16* %tmp6 to <8 x i16>*
tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
%index.next = add i32 %index, 8
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+ %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
@@ -160,7 +160,7 @@ vector.body: ; preds = %vector.body, %vecto
%tmp7 = bitcast i32* %tmp6 to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
%index.next = add i32 %index, 4
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+ %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
@@ -221,7 +221,7 @@ vector.body: ; preds = %vector.body, %vecto
%tmp7 = bitcast i32* %tmp6 to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
%index.next = add i32 %index, 4
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+ %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
@@ -277,7 +277,7 @@ vector.body: ; preds = %vector.body, %vecto
%tmp7 = bitcast i32* %tmp6 to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
%index.next = add i32 %index, 4
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+ %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
@@ -336,7 +336,7 @@ vector.body: ; preds = %vector.body, %vecto
%tmp7 = bitcast i32* %tmp6 to <4 x i32>*
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
%index.next = add i32 %index, 4
- %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+ %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
%tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup
@@ -344,6 +344,92 @@ for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
+; TODO: Multiple intrinsics not yet supported.
+; This is currently rejected, because if the vector body is unrolled, the step
+; is not what we expect:
+;
+; Step value 16 doesn't match vector width 4
+;
+; CHECK-LABEL: interleave4
+; CHECK: vector.body:
+; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
+;
+define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+ %cmp8 = icmp sgt i32 %N, 0
+ %v0 = add i32 %N, 15
+ %v1 = lshr i32 %v0, 4
+ %v2 = shl nuw i32 %v1, 4
+ %v3 = add i32 %v2, -16
+ %v4 = lshr i32 %v3, 4
+ %v5 = add nuw nsw i32 %v4, 1
+ br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+
+vector.ph:
+ %trip.count.minus.1 = add i32 %N, -1
+ %scevgep = getelementptr i32, i32* %A, i32 8
+ %scevgep30 = getelementptr i32, i32* %C, i32 8
+ %scevgep37 = getelementptr i32, i32* %B, i32 8
+ call void @llvm.set.loop.iterations.i32(i32 %v5)
+ br label %vector.body
+
+vector.body:
+ %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
+ %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
+ %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
+ %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
+ %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ]
+ %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
+ %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
+ %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
+ %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+ %v7 = add i32 %index, 4
+ %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
+ %v8 = add i32 %v7, 4
+ %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
+ %v9 = add i32 %v8, 4
+ %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
+ %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
+ %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+ %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
+ %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
+ %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
+ %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1
+ %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
+ %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2
+ %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+ %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1
+ %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
+ %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
+ %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1
+ %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
+ %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
+ %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
+ %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
+ %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
+ %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask)
+ %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16)
+ %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1
+ call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
+ %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16
+ %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16
+ %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16
+ %v14 = add i32 %v9, 4
+ %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
+ %v16 = icmp ne i32 %v15, 0
+ br i1 %v16, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:
+ ret void
+}
+
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
@@ -353,7 +439,7 @@ declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg,
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.set.loop.iterations.i32(i32)
-declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)
+declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
index dc9da0c9f764..13d750310a56 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -266,16 +266,9 @@ for.cond.cleanup:
}
; CHECK-LABEL: @overflow_BTC_plus_1(
-;
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
-;
-; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
-;
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
@@ -316,8 +309,9 @@ for.cond.cleanup:
}
; CHECK-LABEL: @overflow_in_sub(
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
@@ -366,8 +360,9 @@ for.cond.cleanup:
}
; CHECK-LABEL: @overflow_in_rounding_tripcount(
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
@@ -413,15 +408,9 @@ for.cond.cleanup:
; CHECK-LABEL: @IV_not_an_induction(
-;
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
-;
-; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0
-; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 32002, i32 32002, i32 32002, i32 32002>
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
@@ -462,15 +451,9 @@ for.cond.cleanup:
}
; CHECK-LABEL: @IV_wrong_step(
-;
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
-;
-; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 32002, i32 32002, i32 32002, i32 32002>
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
@@ -514,15 +497,9 @@ for.cond.cleanup:
}
; CHECK-LABEL: @IV_step_not_constant(
-;
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
-;
-; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 32002, i32 32002, i32 32002, i32 32002>
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
@@ -563,15 +540,9 @@ for.cond.cleanup:
}
; CHECK-LABEL: @outerloop_phi(
-;
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK-NOT: @llvm.get.active.lane.mask
-; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0
-; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3>
-; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, <i32 4096, i32 4096, i32 4096, i32 4096>
-; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef)
-;
+; CHECK: @llvm.get.active.lane.mask
; CHECK: ret void
;
define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
index 0b103ca54750..5c753134744d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll
@@ -143,21 +143,10 @@ for.cond.cleanup:
;
; CHECK-LABEL: @reduction_not_guarded
;
+; CHECK: vector.body:
; CHECK-NOT: @llvm.arm.mve.vctp
-; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32
-;
-; CHECK: entry:
-; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1
-; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0
-; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer
-;
-; CHECK: vector.body:
-; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
-; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2
-; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef)
-; CHECK: ret
+; CHECK: @llvm.get.active.lane.mask.v8i1.i32
+; CHECK: ret
;
define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {
entry:
@@ -213,20 +202,9 @@ middle.block: ; preds = %vector.body
;
; CHECK-LABEL: @Correlation
;
-; CHECK: entry:
-; CHECK: for.body.lr.ph: ; preds = %entry
-; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph
-; CHECK: vector.ph: ; preds = %for.body
-; CHECK: %trip.count.minus.1 = add i32 %8, -1
-; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7)
-; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
-; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK: br label %vector.body
; CHECK: vector.body:
-; CHECK-NOT: @llvm.arm.mve.vctp
-; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc
-; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}}
-;
+; CHECK-NOT: @llvm.arm.mve.vctp
+; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
;
; FORCE-LABEL: @Correlation
; FORCE: vector.ph: ; preds = %for.body
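
For comparison, when the IsSafeActiveMask checks do hold, InsertVCTPIntrinsic
rewrites the mask to the MVE VCTP intrinsic, counting the remaining elements
down by the vector width each iteration. A rough sketch for 4 lanes, assuming
hypothetically that the element count is available as %elems.start in the
preheader:

    vector.body:
      %elems = phi i32 [ %elems.start, %vector.ph ], [ %elems.rem, %vector.body ]
      %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
      %elems.rem = sub i32 %elems, 4

ARMLowOverheadLoops can then fold such a VCTP into a tail-predicated
hardware loop.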