[llvm] [LV][AArch64] Improve strided access vectorization for AArch64 SVE (PR #164205)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 20 00:31:26 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-aarch64
Author: Kinoshita Kotaro (kinoshita-fj)
Currently, LLVM vectorizes strided accesses with SVE as follows:
```c
void func(double* restrict a, double* b, int n)
{
  for (int i = 0; i < n; i++) {
    a[i] = b[i * 10] + 1;
  }
}
```
=>
```
...
index z1.d, #0, #1
loop:
add z2.d, z1.d, z0.d
mul z1.d, z1.d, #80
ld1d { z1.d }, p0/z, [x1, z1.d]
...
mov z1.d, z2.d
...
```
This generated code is inefficient because it performs the address calculation inside the loop using vector instructions (a vector multiply, add, and move every iteration), which can degrade performance. See #129474.
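For reference, here is a minimal sketch (with illustrative value names, not taken from the patch) of the loop-body IR behind this codegen: a widened gather whose vector of addresses must be recomputed from the induction vector on every iteration.
```llvm
; Hypothetical loop-body IR for the example above. The pointer vector is
; rebuilt from the vector induction variable each iteration, which lowers
; to the vector mul/add/mov sequence in the assembly.
%offsets = mul <vscale x 2 x i64> %vec.ind, splat (i64 10)
%ptrs = getelementptr double, ptr %b, <vscale x 2 x i64> %offsets
%wide.gather = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64.nxv2p0(
    <vscale x 2 x ptr> %ptrs, i32 8, <vscale x 2 x i1> %mask,
    <vscale x 2 x double> poison)
```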
Ideally, we want to generate efficient code that keeps the offset vector `z1` constant and updates the base register `x1` with a single scalar instruction:
```
...
index z1.d, #0, #10
loop:
ld1d z2.d, p7/z, [x1, z1.d, lsl 3]
...
add x1, x1, x2
...
```
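At the IR level, the new `VPWidenStridedLoadRecipe` emits a single `llvm.experimental.vp.strided.load` call taking a scalar base pointer and a scalar byte stride (see `VPWidenStridedLoadRecipe::execute` in the diff below). A minimal sketch, with illustrative value names:
```llvm
; Scalar base pointer and byte stride (10 elements * 8 bytes = 80 for the
; example above); only the scalar base needs to advance across iterations.
%wide.strided.load = call <vscale x 2 x double>
    @llvm.experimental.vp.strided.load.nxv2f64.p0.i64(
        ptr align 8 %base, i64 80, <vscale x 2 x i1> %mask, i32 %evl)
```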
This patch enables strided accesses to be vectorized efficiently, as shown above. It builds on #147297, which detects strided accesses and converts them into strided-access recipes; this patch then lowers those recipes into a legal and efficient sequence of recipes for AArch64.
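In VPlan terms, the transform replaces the widened gather with the new strided-load recipe. A hypothetical before/after dump, following the printing format added by this patch (operand names are illustrative):
```
; Before: widened gather through a vector of pointers
WIDEN ir<%load> = load vp<%vector.ptrs>
; After: strided load from a scalar base, as printed by the new recipe
WIDEN ir<%load> = load vp<%base>, stride = vp<%stride>, runtimeVF = vp<%vf>
```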
I am submitting this patch as a draft for the following reasons:
- I have not yet created sufficient test cases for this patch.
- I have not yet confirmed that there are no performance regressions.
---
Patch is 149.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/164205.diff
23 Files Affected:
- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+5)
- (modified) llvm/include/llvm/Analysis/TargetTransformInfoImpl.h (+5)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+5)
- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h (+6)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+26-9)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+82-11)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+4-2)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+65-8)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+252-6)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.h (+11)
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (+1)
- (modified) llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp (+2-1)
- (added) llvm/test/Transforms/LoopVectorize/AArch64/strided-accesses.ll (+130)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll (+20-18)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll (+21-5)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll (+5-2)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/blocks-with-dead-instructions.ll (+5-1)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll (+18-12)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll (+41-7)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll (+98-85)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll (+62-14)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll (+64-94)
- (modified) llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll (+2-2)
``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 7a4abe9ee5082..b8e128fe8cf3f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -852,6 +852,11 @@ class TargetTransformInfo {
/// Return true if the target supports strided load.
LLVM_ABI bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
+ /// Return true if the target benefits from the generation of a more
+ /// efficient instruction sequence for strided accesses.
+ LLVM_ABI bool preferToUseStrideRecipesForVectorization(Type *DataType,
+ Align Alignment) const;
+
/// Return true is the target supports interleaved access for the given vector
/// type \p VTy, interleave factor \p Factor, alignment \p Alignment and
/// address space \p AddrSpace.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 566e1cf51631a..43aa6f7be0dbb 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -374,6 +374,11 @@ class TargetTransformInfoImplBase {
return false;
}
+ virtual bool preferToUseStrideRecipesForVectorization(Type *DataType,
+ Align Alignment) const {
+ return false;
+ }
+
virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor,
Align Alignment,
unsigned AddrSpace) const {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index bf62623099a97..f142a705dcf6b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -532,6 +532,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
}
+bool TargetTransformInfo::preferToUseStrideRecipesForVectorization(
+ Type *DataType, Align Alignment) const {
+ return TTIImpl->preferToUseStrideRecipesForVectorization(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalInterleavedAccessType(
VectorType *VTy, unsigned Factor, Align Alignment,
unsigned AddrSpace) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index fe2e849258e3f..4e69f06a0279f 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -346,6 +346,12 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
return isLegalMaskedGatherScatter(DataType);
}
+ bool
+ preferToUseStrideRecipesForVectorization(Type *DataType,
+ Align Alignment) const override {
+ return isLegalMaskedGatherScatter(DataType);
+ }
+
bool isLegalBroadcastLoad(Type *ElementTy,
ElementCount NumElements) const override {
// Return true if we can generate a `ld1r` splat load instruction.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 12fb46da8e71a..08b9a3f6ed9a9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3956,7 +3956,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
- .Case<VPWidenLoadRecipe>(
+ .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4056,6 +4056,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
+ case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -6940,6 +6941,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}
+
+ // The strided load is transformed from a gather through VPlanTransform,
+ // and its cost will be lower than the original gather.
+ if (isa<VPWidenStridedLoadRecipe>(&R))
+ return true;
+
if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
@@ -7495,7 +7502,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, I->getDebugLoc());
} else {
- VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+ const DataLayout &DL = I->getDataLayout();
+ auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
+ VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+ VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
@@ -8592,19 +8602,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan))
return nullptr;
+ VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
- if (!CM.foldTailWithEVL()) {
- VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+ if (!CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
- }
-
- for (ElementCount VF : Range)
- Plan->addVF(VF);
- Plan->setName("Initial VPlan");
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
@@ -8617,6 +8622,18 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
Legal->getLAI()->getSymbolicStrides());
+ // Convert memory recipes to strided access recipes if the strided access is
+ // legal and profitable.
+ VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+ CostCtx, Range);
+
+ VPlanTransforms::runPass(VPlanTransforms::legalizeStridedAccess, *Plan,
+ CostCtx, Range);
+
+ for (ElementCount VF : Range)
+ Plan->addVF(VF);
+ Plan->setName("Initial VPlan");
+
auto BlockNeedsPredication = [this](BasicBlock *BB) {
return Legal->blockNeedsPredication(BB);
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 10d704df289c8..08f6d3d493060 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -563,6 +563,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
+ case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1769,10 +1770,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
Type *SourceElementTy;
- bool isPointerLoopInvariant() const {
- return getOperand(0)->isDefinedOutsideLoopRegions();
- }
-
bool isIndexLoopInvariant(unsigned I) const {
return getOperand(I + 1)->isDefinedOutsideLoopRegions();
}
@@ -1805,6 +1802,29 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// This recipe generates a GEP instruction.
unsigned getOpcode() const { return Instruction::GetElementPtr; }
+ bool isPointerLoopInvariant() const {
+ return getOperand(0)->isDefinedOutsideLoopRegions();
+ }
+
+ std::optional<unsigned> getUniqueVariantIndex() const {
+ std::optional<unsigned> VarIdx;
+ for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+ if (isIndexLoopInvariant(I))
+ continue;
+
+ if (VarIdx)
+ return std::nullopt;
+ VarIdx = I;
+ }
+ return VarIdx;
+ }
+
+ Type *getIndexedType(unsigned I) const {
+ auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+ SmallVector<Value *, 4> Ops(GEP->idx_begin(), GEP->idx_begin() + I);
+ return GetElementPtrInst::getIndexedType(SourceElementTy, Ops);
+ }
+
/// Generate the gep nodes.
void execute(VPTransformState &State) override;
@@ -1895,20 +1915,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
#endif
};
-/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// A recipe to compute the pointers for widened memory accesses of IndexedTy,
+/// with the Stride expressed in units of IndexedTy.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
- public VPUnrollPartAccessor<1> {
+ public VPUnrollPartAccessor<2> {
Type *SourceElementTy;
public:
- VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy,
+ VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy, VPValue *Stride,
GEPNoWrapFlags GEPFlags, DebugLoc DL)
- : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
- GEPFlags, DL),
+ : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
+ ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
SourceElementTy(SourceElementTy) {}
VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
+ VPValue *getStride() const { return getOperand(1); }
+
void execute(VPTransformState &State) override;
Type *getSourceElementType() const { return SourceElementTy; }
@@ -1929,7 +1952,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
VPVectorPointerRecipe *clone() override {
return new VPVectorPointerRecipe(getOperand(0), SourceElementTy,
- getGEPNoWrapFlags(), getDebugLoc());
+ getStride(), getGEPNoWrapFlags(),
+ getDebugLoc());
}
/// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that
@@ -3186,7 +3210,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
- R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+ R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+ R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
}
static inline bool classof(const VPUser *U) {
@@ -3307,6 +3332,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
}
};
+/// A recipe for strided load operations, using the base address, stride, and an
+/// optional mask. This recipe will generate an vp.strided.load intrinsic call
+/// to represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+ public VPValue {
+ VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+ VPValue *VF, VPValue *Mask,
+ const VPIRMetadata &Metadata, DebugLoc DL)
+ : VPWidenMemoryRecipe(
+ VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+ /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+ VPValue(this, &Load) {
+ setMask(Mask);
+ }
+
+ VPWidenStridedLoadRecipe *clone() override {
+ return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+ getStride(), getVF(), getMask(), *this,
+ getDebugLoc());
+ }
+
+ VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+ /// Return the stride operand.
+ VPValue *getStride() const { return getOperand(1); }
+
+ /// Return the VF operand.
+ VPValue *getVF() const { return getOperand(2); }
+
+ /// Generate a strided load.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getAddr() || Op == getStride() || Op == getVF();
+ }
+};
+
/// A recipe for widening store operations, using the stored value, the address
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 07bfe7a896d86..027e5238c74c0 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -188,8 +188,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}
Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
- assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
- "Store recipes should not define any values");
+ assert(
+ (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ R)) &&
+ "Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 3a55710d59b08..0345bf1bfe721 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -82,6 +82,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -105,6 +106,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -188,6 +190,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
+ case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -2581,13 +2584,21 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
void VPVectorPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
- Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
- /*IsUnitStride*/ true, CurrentPart, Builder);
+ Value *Stride = State.get(getStride(), /*IsScalar*/ true);
+
+ auto *StrideC = dyn_cast<ConstantInt>(Stride);
+ bool IsStrideOne = StrideC && StrideC->isOne();
+ bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne());
+ Type *IndexTy =
+ getGEPIndexTy(State.VF.isScalable(),
+ /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));
+ Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy);
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
- Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
- "", getGEPNoWrapFlags());
+ Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride);
+ Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Index, "",
+ getGEPNoWrapFlags());
State.set(this, ResultPtr, /*IsScalar*/ true);
}
@@ -3355,9 +3366,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
const Align Alignment = getLoadStoreAlignment(&Ingredient);
unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
->getAddressSpace();
- unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
- ? Instruction::Load
- : Instruction::Store;
+ unsigned Opcode =
+ isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+ this)
+ ? Instruction::Load
+ : Instruction::Store;
if (!Consecutive) {
// TODO: Using the original IR may not be accurate.
@@ -3367,8 +3380,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
"Inconsecutive memory access should not have the order.");
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
- Type *PtrTy = Ptr->getType();
+ if (isa<VPWidenStridedLoadRecipe>(this))
+ return Ctx.TTI.getStridedMemoryOpCost(
+ Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
+ Type *PtrTy = Ptr->getType();
// If the address value is uniform across all lanes, then the address can be
// calculated with scalar type and broadcast.
if (!vputils::isSingleScalar(getAddr()))
@@ -3523,6 +3539,47 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
+ Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+ const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+ auto &Builder = State.Builder;
+ Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
+ Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true);
+ Value *Mask = nullptr;
+ if (VPValue *VPMask = getMask())
+ Mask = State.get(VPMask);
+ else
+ Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+ Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
+ Builder.getInt32Ty());
+
+ auto *PtrTy = Addr->getType();
+ auto *StrideTy = StrideInBytes->getType();
+ CallInst *NewLI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+ {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+ NewLI->addParamAttr(
+ 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+ applyMetadata(*NewLI);
+ State.set(this, NewLI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "WIDEN ";
+ printAsOperand(O, SlotTracker);
+ O << " = load ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ O << ", stride = ";
+ getStride()->printAsOperand(O, SlotTracker);
+ O << ", runtimeVF = ";
+ getVF()->printAsOperand(O, SlotTracker);
+}
+#endif
+
void VPWidenSt...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/164205