[llvm] d4e183f - Revert "[LV] Emit @llvm.get.active.mask for tail-folded loops"
Sjoerd Meijer via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 17 02:10:24 PDT 2020
Author: Sjoerd Meijer
Date: 2020-06-17T10:09:54+01:00
New Revision: d4e183f68631019fe51f855a554988292868b9ac
URL: https://github.com/llvm/llvm-project/commit/d4e183f68631019fe51f855a554988292868b9ac
DIFF: https://github.com/llvm/llvm-project/commit/d4e183f68631019fe51f855a554988292868b9ac.diff
LOG: Revert "[LV] Emit @llvm.get.active.mask for tail-folded loops"
This reverts commit 47650451738c821993c763356854b560a0f9f550
while I investigate the build bot failures.
Added:
Modified:
llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
llvm/lib/Transforms/Vectorize/VPlan.cpp
llvm/lib/Transforms/Vectorize/VPlan.h
llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 82849dc67f43..6d7a19073bf2 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1413,14 +1413,14 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
}
bool ARMTTIImpl::emitGetActiveLaneMask() const {
- if (!ST->hasMVEIntegerOps() || DisableTailPredication)
+ if (!ST->hasMVEIntegerOps())
return false;
- // Intrinsic @llvm.get.active.lane.mask is supported.
+ // TODO: Intrinsic @llvm.get.active.lane.mask is supported.
// It is used in the MVETailPredication pass, which requires the number of
// elements processed by this vector loop to setup the tail-predicated
// loop.
- return true;
+ return false;
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index ec756671ea67..3591f6b1acdb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6829,11 +6829,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
IV = IVRecipe->getVPValue();
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- bool TailFolded = !CM.isScalarEpilogueAllowed();
- if (TailFolded && CM.TTI.emitGetActiveLaneMask())
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
- else
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
return BlockMaskCache[BB] = BlockMask;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1508b2e371a5..a015550b0c77 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -380,20 +380,6 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.set(this, V, Part);
break;
}
- case VPInstruction::ActiveLaneMask: {
- // Get first lane of vector induction variable.
- Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
- // Get first lane of backedge-taken-count.
- Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
-
- auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
- Instruction *Call = Builder.CreateIntrinsic(
- Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
- {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
- State.set(this, Call, Part);
- break;
- }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -435,10 +421,6 @@ void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
case VPInstruction::SLPStore:
O << "combined store";
break;
- case VPInstruction::ActiveLaneMask:
- O << "active lane mask";
- break;
-
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 914cb038adf4..c3097e1e1f14 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -685,7 +685,6 @@ class VPInstruction : public VPUser, public VPRecipeBase {
ICmpULE,
SLPLoad,
SLPStore,
- ActiveLaneMask,
};
private:
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
index 0c2502df6f0a..9939c21b8d1d 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@@ -45,12 +45,9 @@
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: prefer_folding(
; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
-; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
-; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
+; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
@@ -510,13 +507,9 @@ for.body:
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: float(
; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
-; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
-; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
-; PREFER-FOLDING: %index.next = add i32 %index, 4
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
+; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
index 44f8133f29ed..274b626c8d91 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -15,13 +15,9 @@ target triple = "thumbv8.1m.main-arm-unknown-eabihf"
define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_loopguard(
; COMMON: vector.body:
-
-; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
-; CHECK-TF: %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
-; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
-; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
-; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
-; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
+; CHECK-TF: masked.load
+; CHECK-TF: masked.load
+; CHECK-TF: masked.store
entry:
%cmp5 = icmp sgt i32 %N, 0
br i1 %cmp5, label %while.body.preheader, label %while.end
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
index f730350c70e3..72321f0401a0 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -41,15 +41,11 @@ for.body:
define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
; COMMON-LABEL: tail_folding_enabled(
; COMMON: vector.body:
-; COMMON: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; COMMON: %[[ELEM0:.*]] = add i64 %index, 0
-; COMMON: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
-; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
-; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
-; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
-; COMMON: %index.next = add i64 %index, 4
-; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
+; COMMON: br i1 %12, label %{{.*}}, label %vector.body
entry:
br label %for.body
@@ -79,16 +75,13 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
; PREDFLAG-LABEL: tail_folding_disabled(
; PREDFLAG: vector.body:
-; PREDFLAG: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; PREDFLAG: %[[ELEM0:.*]] = add i64 %index, 0
-; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
-; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
-; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
; PREDFLAG: %index.next = add i64 %index, 4
-; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
+; PREDFLAG: %12 = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
entry:
br label %for.body
@@ -109,59 +102,6 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}
-define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
-; PREDFLAG-LABEL: interleave4(
-; PREDFLAG: %[[ADD1:.*]] = add i32 %index, 0
-; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4
-; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8
-; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12
-; PREDFLAG: %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
-; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
-; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
-; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
-; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
-;
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
-; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
-;
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
-;
-entry:
- %cmp8 = icmp sgt i32 %N, 0
- br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader: ; preds = %entry
- br label %for.body
-
-for.cond.cleanup.loopexit: ; preds = %for.body
- br label %for.cond.cleanup
-
-for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
- ret void
-
-for.body: ; preds = %for.body.preheader, %for.body
- %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
- %0 = load i32, i32* %arrayidx, align 4
- %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
- %1 = load i32, i32* %arrayidx1, align 4
- %add = add nsw i32 %1, %0
- %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
- store i32 %add, i32* %arrayidx2, align 4
- %inc = add nuw nsw i32 %i.09, 1
- %exitcond = icmp eq i32 %inc, %N
- br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
-}
-
; CHECK: !0 = distinct !{!0, !1}
; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
@@ -169,7 +109,6 @@ for.body: ; preds = %for.body.preheader,
; CHECK-NEXT: !4 = distinct !{!4, !1}
; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
; CHECK-NEXT: !6 = distinct !{!6, !1}
-
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
!6 = distinct !{!6, !7, !8}
@@ -179,6 +118,3 @@ attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+
!10 = distinct !{!10, !11, !12}
!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
!12 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-!14 = distinct !{!14, !15}
-!15 = !{!"llvm.loop.interleave.count", i32 4}
More information about the llvm-commits
mailing list