[llvm] [SLP]Initial support for (masked)loads + compress and (masked)interleaved (PR #132099)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 27 04:33:03 PDT 2025
================
@@ -5741,6 +5748,154 @@ static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
   return Builder.CreateShuffleVector(Vec, Mask);
 }
+/// Builds a compress-like shuffle mask for the given \p PointerOps, ordered
+/// with \p Order.
+/// \return true if the mask represents strided access, false otherwise.
+static bool buildCompressMask(ArrayRef<Value *> PointerOps,
+                              ArrayRef<unsigned> Order, Type *ScalarTy,
+                              const DataLayout &DL, ScalarEvolution &SE,
+                              SmallVectorImpl<int> &CompressMask) {
+  const unsigned Sz = PointerOps.size();
+  CompressMask.assign(Sz, PoisonMaskElem);
+  // The first element is always set.
+  CompressMask[0] = 0;
+  // Check if the mask represents strided access.
+  std::optional<unsigned> Stride = 0;
+  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
+  for (unsigned I : seq<unsigned>(1, Sz)) {
+    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
+    unsigned Pos = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE);
+    CompressMask[I] = Pos;
+    if (!Stride)
+      continue;
+    if (*Stride == 0) {
+      *Stride = Pos;
+      continue;
+    }
+    if (Pos != *Stride * I)
+      Stride.reset();
+  }
+  return Stride.has_value();
+}
+
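
// Illustration only, not part of the patch: for scalar loads at element
// offsets {0, 2, 4, 6} from the first (ordered) pointer, the helper above
// fills CompressMask with {0, 2, 4, 6} and returns true (strided access with
// stride 2); for offsets {0, 1, 3, 7} it fills {0, 1, 3, 7} and returns false.
// A caller could then materialize the "(masked) load + compress" form roughly
// as sketched below. This is a hedged sketch using generic IRBuilder APIs;
// Builder, Ptr0, LoadVecTy, CommonAlignment, IsMasked and CompressMask are
// assumed to be in scope and mirror the names used in this patch, but this is
// not the patch's actual codegen path.
Value *WideLoad = nullptr;
if (IsMasked) {
  // Enable only the lanes that hold a requested scalar; the masked load does
  // not access the remaining lanes.
  SmallVector<Constant *> MaskElts(
      cast<FixedVectorType>(LoadVecTy)->getNumElements(), Builder.getFalse());
  for (int Idx : CompressMask)
    MaskElts[Idx] = Builder.getTrue();
  WideLoad = Builder.CreateMaskedLoad(LoadVecTy, Ptr0, CommonAlignment,
                                      ConstantVector::get(MaskElts));
} else {
  WideLoad = Builder.CreateAlignedLoad(LoadVecTy, Ptr0, CommonAlignment);
}
// Compress the loaded lanes into a contiguous vector of VL.size() elements.
Value *Compressed = Builder.CreateShuffleVector(WideLoad, CompressMask);
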
+/// Checks if \p VL can be transformed to a (masked) load + compress or a
+/// (masked) interleaved load.
+static bool isMaskedLoadCompress(
+    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
+    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
+    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
+    const DominatorTree &DT, const TargetLibraryInfo &TLI,
+    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
+    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
+    VectorType *&LoadVecTy) {
+  InterleaveFactor = 0;
+  Type *ScalarTy = VL.front()->getType();
+  const unsigned Sz = VL.size();
+  auto *VecTy = getWidenedType(ScalarTy, Sz);
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  // Check external uses.
+  for (const auto [I, V] : enumerate(VL)) {
+    if (AreAllUsersVectorized(V))
+      continue;
+    InstructionCost ExtractCost =
+        TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind, I);
+    InstructionCost ScalarCost =
+        TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+    if (ExtractCost <= ScalarCost)
+      return false;
+  }
+  Value *Ptr0;
+  Value *PtrN;
+  if (Order.empty()) {
+    Ptr0 = PointerOps.front();
+    PtrN = PointerOps.back();
+  } else {
+    Ptr0 = PointerOps[Order.front()];
+    PtrN = PointerOps[Order.back()];
+  }
+  std::optional<int> Diff =
+      getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, DL, SE);
+  if (!Diff)
+    return false;
+  const unsigned MaxRegSize =
+      TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+  // Check for very large distances between elements.
+  if (*Diff / Sz >= MaxRegSize / 8)
+    return false;
+  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
+  LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
+  auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+  IsMasked = !isSafeToLoadUnconditionally(
+      Ptr0, LoadVecTy, CommonAlignment, DL,
+      cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
+      &TLI);
+  // TODO: perform the analysis of each scalar load for better
+  // safe-load-unconditionally analysis.
+  bool IsStrided =
+      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
+  assert(CompressMask.size() >= 2 && "At least two elements are required");
+  auto [ScalarGEPCost, VectorGEPCost] =
+      getGEPCosts(TTI, PointerOps, PointerOps.front(),
+                  Instruction::GetElementPtr, CostKind, ScalarTy, LoadVecTy);
+  // The cost of scalar loads.
+  InstructionCost ScalarLoadsCost =
+      std::accumulate(VL.begin(), VL.end(), InstructionCost(),
+                      [&](InstructionCost C, Value *V) {
+                        return C + TTI.getInstructionCost(cast<Instruction>(V),
+                                                          CostKind);
+                      }) +
+      ScalarGEPCost;
+  APInt DemandedElts = APInt::getAllOnes(Sz);
+  InstructionCost GatherCost =
+      getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts,
+                               /*Insert=*/true,
+                               /*Extract=*/false, CostKind) +
+      ScalarLoadsCost;
+  InstructionCost LoadCost = 0;
+  if (IsMasked) {
+    LoadCost =
+        TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                                  LI->getPointerAddressSpace(), CostKind);
+  } else {
+    CommonAlignment = LI->getAlign();
+    LoadCost =
+        TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
+                            LI->getPointerAddressSpace(), CostKind);
+  }
+  SmallVector<int> Mask;
+  if (!Order.empty())
+    inversePermutation(Order, Mask);
+  if (IsStrided) {
+    // Check for potential segmented (interleaved) loads.
+    if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
+                                         CommonAlignment,
+                                         LI->getPointerAddressSpace())) {
+      InstructionCost InterleavedCost = TTI.getInterleavedMemoryOpCost(
+          Instruction::Load, LoadVecTy, CompressMask[1], std::nullopt,
+          CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
+      if (!Mask.empty())
----------------
alexey-bataev wrote:
Need to add the cost for Mask, not the CompressMask
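
A minimal sketch of the kind of change being asked for, assuming the truncated
hunk currently folds a shuffle cost based on CompressMask into InterleavedCost:
when Order is non-empty, the extra reordering shuffle is described by Mask (the
inverse permutation of Order built earlier in the function), so its cost would
be taken from Mask instead, e.g. via the generic TTI shuffle-cost hook. Names
mirror the patch; the exact helper and vector type used in SLPVectorizer may
differ.

    if (!Mask.empty())
      InterleavedCost +=
          TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, VecTy, Mask, CostKind);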
https://github.com/llvm/llvm-project/pull/132099