[llvm] [LoopVectorize][AArch64][SVE] Generate wide active lane masks (PR #81140)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 8 06:42:44 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: Momchil Velikov (momchil-velikov)
<details>
<summary>Changes</summary>
This patch makes the `LoopVectorize` pass able to generate lane
masks longer than the VF to allow the target to better utilise
the instruction set. The vectoriser can emit one or more wide
`llvm.get.active.lane.mask.*` calls plus several `llvm.vector.extract.*`
calls to yield the required number of VF-wide masks.
The motivating example is a vectorised loop with unroll factor 2 that
can use the SVE2.1 `whilelo` instruction with predicate pair result, or
an SVE `whilelo` instruction with smaller element size plus
`punpklo`/`punpkhi`.
How wide a lane mask the vectoriser emits is controlled
by a `TargetTransformInfo` hook `getMaxPredicateLength`. The default
implementation (return the same length as the VF) keeps the
change non-functional for targets that can't or are not prepared
to handle wider lane masks.
---
Patch is 497.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81140.diff
23 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+10)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+2)
- (modified) llvm/include/llvm/CodeGen/BasicTTIImpl.h (+2)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+4)
- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+86-35)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+9)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+2)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h (+8)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+6-1)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+42-4)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+70-15)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+7-11)
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1)
- (modified) llvm/test/CodeGen/AArch64/active_lane_mask.ll (+1)
- (added) llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll (+90)
- (added) llvm/test/CodeGen/AArch64/sve-wide-lane-mask.ll (+1069)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll (+171-186)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll (+890-878)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll (+197-189)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll (+664)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll (+77-89)
- (modified) llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll (+3-3)
- (modified) llvm/test/Transforms/LoopVectorize/strict-fadd-interleave-only.ll (+10-10)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 58577a6b6eb5c..67e1b45cce29c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1228,6 +1228,8 @@ class TargetTransformInfo {
/// and the number of execution units in the CPU.
unsigned getMaxInterleaveFactor(ElementCount VF) const;
+ ElementCount getMaxPredicateLength(ElementCount VF) const;
+
/// Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
static OperandValueInfo getOperandInfo(const Value *V);
@@ -1981,6 +1983,9 @@ class TargetTransformInfo::Concept {
virtual bool shouldPrefetchAddressSpace(unsigned AS) const = 0;
virtual unsigned getMaxInterleaveFactor(ElementCount VF) = 0;
+
+ virtual ElementCount getMaxPredicateLength(ElementCount VF) const = 0;
+
virtual InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
OperandValueInfo Opd1Info, OperandValueInfo Opd2Info,
@@ -2601,6 +2606,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getMaxInterleaveFactor(ElementCount VF) override {
return Impl.getMaxInterleaveFactor(VF);
}
+
+ ElementCount getMaxPredicateLength(ElementCount VF) const override {
+ return Impl.getMaxPredicateLength(VF);
+ }
+
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
unsigned &JTSize,
ProfileSummaryInfo *PSI,
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 3d5db96e86b80..b6d01e0764ab1 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -528,6 +528,8 @@ class TargetTransformInfoImplBase {
unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index bb17298daba03..2b0d0f3ed6f70 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -881,6 +881,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const { return VF; }
+
InstructionCost getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1f11f0d7dd620..daea8e48981ec 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -808,6 +808,10 @@ unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
+ElementCount TargetTransformInfo::getMaxPredicateLength(ElementCount VF) const {
+ return TTIImpl->getMaxPredicateLength(VF);
+}
+
TargetTransformInfo::OperandValueInfo
TargetTransformInfo::getOperandInfo(const Value *V) {
OperandValueKind OpInfo = OK_AnyValue;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8573939b04389..4405e8d3f91df 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1813,8 +1813,8 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT) {
bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
EVT OpVT) const {
- // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
- if (!Subtarget->hasSVE())
+ // Only SVE/SME has a 1:1 mapping from intrinsic -> instruction (whilelo).
+ if (!Subtarget->hasSVEorSME())
return true;
// We can only support legal predicate result types. We can use the SVE
@@ -20004,47 +20004,98 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
return SDValue();
}
-static SDValue performIntrinsicCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
+static SDValue tryCombineGetActiveLaneMask(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
- unsigned IID = getIntrinsicID(N);
- switch (IID) {
- default:
- break;
- case Intrinsic::get_active_lane_mask: {
- SDValue Res = SDValue();
- EVT VT = N->getValueType(0);
- if (VT.isFixedLengthVector()) {
- // We can use the SVE whilelo instruction to lower this intrinsic by
- // creating the appropriate sequence of scalable vector operations and
- // then extracting a fixed-width subvector from the scalable vector.
+ EVT VT = N->getValueType(0);
+ if (VT.isFixedLengthVector()) {
+ // We can use the SVE whilelo instruction to lower this intrinsic by
+ // creating the appropriate sequence of scalable vector operations and
+ // then extracting a fixed-width subvector from the scalable vector.
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
- SDLoc DL(N);
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
+ EVT WhileVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ ElementCount::getScalable(VT.getVectorNumElements()));
- EVT WhileVT = EVT::getVectorVT(
- *DAG.getContext(), MVT::i1,
- ElementCount::getScalable(VT.getVectorNumElements()));
+ // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
+ EVT PromVT = getPromotedVTForPredicate(WhileVT);
- // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
- EVT PromVT = getPromotedVTForPredicate(WhileVT);
+ // Get the fixed-width equivalent of PromVT for extraction.
+ EVT ExtVT =
+ EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
+ VT.getVectorElementCount());
- // Get the fixed-width equivalent of PromVT for extraction.
- EVT ExtVT =
- EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
- VT.getVectorElementCount());
+ SDValue Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
+ N->getOperand(1), N->getOperand(2));
+ Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
+ DAG.getConstant(0, DL, MVT::i64));
+ Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
- N->getOperand(1), N->getOperand(2));
- Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
- DAG.getConstant(0, DL, MVT::i64));
- Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
- }
return Res;
}
+
+ if (!Subtarget->hasSVE2p1() && !Subtarget->hasSME2())
+ return SDValue();
+
+ if (!N->hasNUsesOfValue(2, 0))
+ return SDValue();
+
+ auto It = N->use_begin();
+ SDNode *Lo = *It++;
+ SDNode *Hi = *It;
+
+ const uint64_t HalfSize = VT.getVectorMinNumElements() / 2;
+ uint64_t OffLo, OffHi;
+ if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Lo->getOperand(1).getNode(), OffLo) ||
+ (OffLo != 0 && OffLo != HalfSize) ||
+ Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ !isIntImmediate(Hi->getOperand(1).getNode(), OffHi) ||
+ (OffHi != 0 && OffHi != HalfSize))
+ return SDValue();
+
+ if (OffLo > OffHi) {
+ std::swap(Lo, Hi);
+ std::swap(OffLo, OffHi);
+ }
+
+ if (OffLo != 0 || OffHi != HalfSize)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ID =
+ DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
+ SDValue Idx = N->getOperand(1);
+ SDValue TC = N->getOperand(2);
+ if (Idx.getValueType() != MVT::i64) {
+ Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
+ TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
+ }
+ auto R =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
+ {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
+
+ DCI.CombineTo(Lo, R.getValue(0));
+ DCI.CombineTo(Hi, R.getValue(1));
+
+ return SDValue(N, 0);
+}
+
+static SDValue performIntrinsicCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ unsigned IID = getIntrinsicID(N);
+ switch (IID) {
+ default:
+ break;
+ case Intrinsic::get_active_lane_mask:
+ return tryCombineGetActiveLaneMask(N, DCI, Subtarget);
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c..73aca77305df1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3285,6 +3285,15 @@ unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
return ST->getMaxInterleaveFactor();
}
+ElementCount AArch64TTIImpl::getMaxPredicateLength(ElementCount VF) const {
+ // Do not create masks bigger than `<vscale x 16 x i1>`.
+ unsigned N = ST->hasSVE() ? 16 : 0;
+ // Do not create masks that are more than twice the VF.
+ N = std::min(N, 2 * VF.getKnownMinValue());
+ return VF.isScalable() ? ElementCount::getScalable(N)
+ : ElementCount::getFixed(N);
+}
+
// For Falkor, we want to avoid having too many strided loads in a loop since
// that can exhaust the HW prefetcher resources. We adjust the unroller
// MaxCount preference below to attempt to ensure unrolling doesn't create too
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index de39dea2be43e..6501cc4a85e8d 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -157,6 +157,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
unsigned getMaxInterleaveFactor(ElementCount VF);
+ ElementCount getMaxPredicateLength(ElementCount VF) const;
+
bool prefersVectorizedAddressing() const;
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7ebf78e54ceb..0e681c8080bfd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,6 +184,14 @@ class VPBuilder {
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
DebugLoc DL = {}, const Twine &Name = "");
+ VPValue *createGetActiveLaneMask(VPValue *IV, VPValue *TC, DebugLoc DL,
+ const Twine &Name = "") {
+ auto *ALM = new VPActiveLaneMaskRecipe(IV, TC, DL, Name);
+ if (BB)
+ BB->insert(ALM, InsertPt);
+ return ALM;
+ }
+
//===--------------------------------------------------------------------===//
// RAII helpers.
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1a7b301c35f2b..bac66e633a6f3 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -599,6 +599,10 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ ElementCount getMaxPredicateLength(ElementCount VF) const {
+ return TTI->getMaxPredicateLength(VF);
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -7550,7 +7554,8 @@ LoopVectorizationPlanner::executePlan(
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
// Perform the actual loop transformation.
- VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+ VPTransformState State(BestVF, BestUF, TTI.getMaxPredicateLength(BestVF), LI,
+ DT, ILV.Builder, &ILV, &BestVPlan,
OrigLoop->getHeader()->getContext());
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 162a3c4b195e5..6f20bc148e72e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -234,15 +234,16 @@ struct VPIteration {
/// VPTransformState holds information passed down when "executing" a VPlan,
/// needed for generating the output IR.
struct VPTransformState {
- VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
- DominatorTree *DT, IRBuilderBase &Builder,
+ VPTransformState(ElementCount VF, unsigned UF, ElementCount MaxPred,
+ LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx)
- : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
- LVer(nullptr), TypeAnalysis(Ctx) {}
+ : VF(VF), UF(UF), MaxPred(MaxPred), LI(LI), DT(DT), Builder(Builder),
+ ILV(ILV), Plan(Plan), LVer(nullptr), TypeAnalysis(Ctx) {}
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
unsigned UF;
+ ElementCount MaxPred;
/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
@@ -1275,6 +1276,43 @@ class VPInstruction : public VPRecipeWithIRFlags {
}
};
+class VPActiveLaneMaskRecipe : public VPRecipeWithIRFlags {
+ const std::string Name;
+
+public:
+ VPActiveLaneMaskRecipe(VPValue *IV, VPValue *TC, DebugLoc DL = {},
+ const Twine &Name = "")
+ : VPRecipeWithIRFlags(VPDef::VPActiveLaneMaskSC,
+ std::initializer_list<VPValue *>{IV, TC}, DL),
+ Name(Name.str()) {}
+
+ VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskSC)
+
+ VPRecipeBase *clone() override {
+ SmallVector<VPValue *, 2> Operands(operands());
+ assert(Operands.size() == 2 && "by construction");
+ auto *New = new VPActiveLaneMaskRecipe(Operands[0], Operands[1],
+ getDebugLoc(), Name);
+ New->transferFlags(*this);
+ return New;
+ }
+
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+
+ return getOperand(0) == Op;
+ }
+};
+
/// VPWidenRecipe is a recipe for producing a copy of vector type its
/// ingredient. This recipe covers most of the traditional vectorization cases
/// where each ingredient transforms into a vectorized version of itself.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ee0cb2bd6153..c0c75072f4023 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -307,18 +307,7 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
Value *Op2 = State.get(getOperand(2), Part);
return Builder.CreateSelect(Cond, Op1, Op2, Name);
}
- case VPInstruction::ActiveLaneMask: {
- // Get first lane of vector induction variable.
- Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
- // Get the original loop tripcount.
- Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
- auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = VectorType::get(Int1Ty, State.VF);
- return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
- {PredTy, ScalarTC->getType()},
- {VIVElem0, ScalarTC}, nullptr, Name);
- }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -526,7 +515,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case Instruction::ICmp:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
- case VPInstruction::ActiveLaneMask:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
@@ -561,9 +549,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::SLPStore:
O << "combined store";
break;
- case VPInstruction::ActiveLaneMask:
- O << "active lane mask";
- break;
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
@@ -594,8 +579,78 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
DL.print(O);
}
}
+
+void VPActiveLaneMaskRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+
+ printAsOperand(O, SlotTracker);
+ O << " = active lane mask";
+ printFlags(O);
+ printOperands(O, SlotTracker);
+
+ if (auto DL = getDebugLoc()) {
+ O << ", !dbg ";
+ DL.print(O);
+ }
+}
+
#endif
+void VPActiveLaneMaskRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+
+ IRBuilderBase &Builder = State.Builder;
+ Builder.SetCurrentDebugLocation(getDebugLoc());
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+ auto *PredTy = VectorType::get(Int1Ty, State.VF);
+
+ unsigned MaxPred = std::min(State.MaxPred.getKnownMinValue(),
+ State.UF * State.VF.getKnownMinValue());
+ if (State.UF <= 1 || MaxPred <= State.VF.getKnownMinValue() ||
+ MaxPred % State.VF.getKnownMinValue() != 0) {
+ for (int Part = State.UF - 1; Part >= 0; --Part) {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+ // Get the original loop tripcount.
+ Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+ Value *V = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
+ {PredTy, ScalarTC->getType()},
+ {VIVElem0, ScalarTC}, nullptr, Name);
+ State.set(this, V, Part);
+ }
+ return;
+ }
+
+ // Generate long active lane masks covering all the unrolled iterations.
+ unsigned PartsPerMask = MaxPred / State.VF.getKnownMinValue();
+ auto *LongPredTy = VectorType::get(Int1Ty, MaxPred, State.VF.isScalable());
+ SmallVector<Value *> LongMask(State.UF / PartsPerMask, nullptr);
+ for (int Part = State.UF - PartsPerMask; Part >= 0; Part -= PartsPerMask) {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+ // Get the original loop tripcount.
+ Value *ScalarTC = State.get(getOperand(1), VPIteration(0, 0));
+ Value *V = Builder.C...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/81140
More information about the llvm-commits
mailing list