[llvm] [Vectorize] Vectorization for __builtin_prefetch (PR #66160)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 12 21:17:11 PDT 2023
https://github.com/m-saito-fj updated https://github.com/llvm/llvm-project/pull/66160:
>From 51d6a07a9f3de9ac3893d90bdbc6f4aa49cc899b Mon Sep 17 00:00:00 2001
From: Moriyuki Saito <saitou.moriyuki at fujitsu.com>
Date: Tue, 12 Sep 2023 19:24:15 +0900
Subject: [PATCH] [Vectorize] Vectorization for __builtin_prefetch
Allow vectorization of loops containing __builtin_prefetch. Add
masked_prefetch and masked_gather_prefetch intrinsics for this purpose,
and add handling in LoopVectorize to vectorize the prefetch intrinsic.
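
For illustration, a minimal C++ loop of the kind this change targets (the
array names and the prefetch distance below are made up for the example,
not taken from the patch or its tests):

  // The prefetch of b[i + 8] previously forced the loop to stay scalar;
  // with this change it can be widened into llvm.masked.prefetch (or
  // llvm.masked.gather.prefetch for non-consecutive addresses).
  void axpy_with_prefetch(double *a, const double *b, int n) {
    for (int i = 0; i < n; ++i) {
      __builtin_prefetch(&b[i + 8], /*rw=*/0, /*locality=*/3);
      a[i] += b[i];
    }
  }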
---
.../llvm/Analysis/TargetTransformInfo.h | 12 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 8 +
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 ++
llvm/include/llvm/IR/IRBuilder.h | 11 ++
llvm/include/llvm/IR/IntrinsicInst.h | 109 ++++++++++++
llvm/include/llvm/IR/Intrinsics.td | 15 ++
llvm/lib/Analysis/TargetTransformInfo.cpp | 10 ++
llvm/lib/Analysis/VectorUtils.cpp | 1 +
llvm/lib/IR/IRBuilder.cpp | 49 ++++++
.../Vectorize/LoopVectorizationLegality.cpp | 8 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 161 ++++++++++++++----
llvm/lib/Transforms/Vectorize/VPlan.h | 14 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +-
13 files changed, 383 insertions(+), 39 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..935860420afa5b6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -743,6 +743,8 @@ class TargetTransformInfo {
bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
/// Return true if the target supports masked load.
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
+ /// Return true if the target supports masked prefetch.
+ bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const;
/// Return true if the target supports nontemporal store.
bool isLegalNTStore(Type *DataType, Align Alignment) const;
@@ -757,6 +759,8 @@ class TargetTransformInfo {
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
/// Return true if the target supports masked gather.
bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+ /// Return true if the target supports masked gather prefetch.
+ bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const;
/// Return true if the target forces scalarizing of llvm.masked.gather
/// intrinsics.
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const;
@@ -1769,12 +1773,14 @@ class TargetTransformInfo::Concept {
getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0;
virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
+ virtual bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) = 0;
virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
virtual bool isLegalBroadcastLoad(Type *ElementTy,
ElementCount NumElements) const = 0;
virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+ virtual bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) = 0;
virtual bool forceScalarizeMaskedGather(VectorType *DataType,
Align Alignment) = 0;
virtual bool forceScalarizeMaskedScatter(VectorType *DataType,
@@ -2225,6 +2231,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
bool isLegalMaskedLoad(Type *DataType, Align Alignment) override {
return Impl.isLegalMaskedLoad(DataType, Alignment);
}
+ bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) override {
+ return Impl.isLegalMaskedPrefetch(DataType, Alignment);
+ }
bool isLegalNTStore(Type *DataType, Align Alignment) override {
return Impl.isLegalNTStore(DataType, Alignment);
}
@@ -2241,6 +2250,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
return Impl.isLegalMaskedGather(DataType, Alignment);
}
+ bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) override {
+ return Impl.isLegalMaskedGatherPrefetch(DataType, Alignment);
+ }
bool forceScalarizeMaskedGather(VectorType *DataType,
Align Alignment) override {
return Impl.forceScalarizeMaskedGather(DataType, Alignment);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4ab33995618213c..076f39f922a8c4c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -254,6 +254,10 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const {
+ return false;
+ }
+
bool isLegalNTStore(Type *DataType, Align Alignment) const {
// By default, assume nontemporal memory stores are available for stores
// that are aligned and have a size that is a power of 2.
@@ -280,6 +284,10 @@ class TargetTransformInfoImplBase {
return false;
}
+ bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const {
+ return false;
+ }
+
bool forceScalarizeMaskedGather(VectorType *DataType, Align Alignment) const {
return false;
}
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c11d558a73e9d09..d9af917754f14cc 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1563,6 +1563,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
VarMask, Alignment, CostKind, I);
}
+ case Intrinsic::masked_gather_prefetch: {
+ const Value *Mask = Args[4];
+ bool VarMask = !isa<Constant>(Mask);
+ Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
+ auto *MaskVT = cast<VectorType>(Mask->getType());
+ auto *PseudoDataTy = MaskVT->getWithNewBitWidth(Alignment.value() * 8);
+ return thisT()->getGatherScatterOpCost(Instruction::Call, PseudoDataTy, Args[0],
+ VarMask, Alignment, CostKind, I);
+ }
case Intrinsic::experimental_stepvector: {
if (isa<ScalableVectorType>(RetTy))
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -1880,6 +1889,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
CostKind);
}
+ case Intrinsic::masked_prefetch: {
+ auto *MaskVT = cast<VectorType>(ICA.getArgTypes()[4]);
+ Type *PseudoTy = MaskVT->getWithNewBitWidth(32);
+ Align TyAlign = thisT()->DL.getABITypeAlign(PseudoTy);
+ return thisT()->getMaskedMemoryOpCost(Instruction::Call, PseudoTy, TyAlign, 0,
+ CostKind);
+ }
case Intrinsic::vector_reduce_add:
return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
std::nullopt, CostKind);
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index ef86eefdf33b834..bfe0ccb81256e88 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -801,6 +801,11 @@ class IRBuilderBase {
CallInst *CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment,
Value *Mask);
+ /// Create a call to Masked Prefetch intrinsic
+ CallInst *CreateMaskedPrefetch(Value *Ptr, Value *ElemSize, Value *Mask,
+ Value *RW = nullptr, Value *Locality = nullptr,
+ const Twine &Name = "");
+
/// Create a call to Masked Gather intrinsic
CallInst *CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment,
Value *Mask = nullptr, Value *PassThru = nullptr,
@@ -810,6 +815,12 @@ class IRBuilderBase {
CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, Align Alignment,
Value *Mask = nullptr);
+ /// Create a call to Masked Gather Prefetch intrinsic
+ CallInst *CreateMaskedGatherPrefetch(Value *Ptrs, Value *ElemSize,
+ Value *Mask = nullptr, Value *RW = nullptr,
+ Value *Locality = nullptr,
+ const Twine &Name = "");
+
/// Create a call to Masked Expand Load intrinsic
CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, Value *Mask = nullptr,
Value *PassThru = nullptr,
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 62bd833198f022b..a81592aca6dd25e 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1328,6 +1328,115 @@ class AnyMemCpyInst : public AnyMemTransferInst {
}
};
+/// This class represents the prefetch intrinsic
+/// i.e. llvm.prefetch
+class PrefetchInst : public IntrinsicInst {
+public:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::prefetch;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+
+ Value *getPointerOperand() { return getOperand(0); }
+ const Value *getPointerOperand() const { return getOperand(0); }
+ static unsigned getPointerOperandIndex() { return 0U; }
+ Type *getPointerOperandType() const { return getPointerOperand()->getType(); }
+};
+
+/// A helper function that returns the pointer operand of a prefetch
+/// instruction. Returns nullptr if \p V is not a prefetch.
+inline const Value *getPrefetchPointerOperand(const Value *V) {
+ if (auto *Prefetch = dyn_cast<PrefetchInst>(V))
+ return Prefetch->getPointerOperand();
+ return nullptr;
+}
+inline Value *getPrefetchPointerOperand(Value *V) {
+ return const_cast<Value *>(
+ getPrefetchPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a prefetch instruction.
+inline unsigned getPrefetchAddressSpace(Value *I) {
+ assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+ auto *PtrTy = cast<PrefetchInst>(I)->getPointerOperandType();
+ return cast<PointerType>(PtrTy)->getAddressSpace();
+}
+
+/// A helper function that returns the pseudo type of a prefetch instruction.
+inline Type *getPrefetchPseudoType(Value *I) {
+ assert(isa<PrefetchInst>(I) && "Expected Prefetch instruction");
+ auto *Prefetch = cast<PrefetchInst>(I);
+
+ // Get type for the following pattern
+ // ex) %1 = add nuw nsw i64 %indvars.iv, 8
+ // %arrayidx = getelementptr inbounds double, ptr %b, i64 %1
+ // tail call void @llvm.prefetch.p0(ptr nonnull %arrayidx, i32 0, i32 3, i32 1)
+ auto *GEP = dyn_cast<GetElementPtrInst>(Prefetch->getPointerOperand());
+ if (GEP) {
+ auto *ElemTy = GEP->getSourceElementType();
+ if (isa<ArrayType>(ElemTy) || isa<StructType>(ElemTy))
+ return Type::getInt64Ty(I->getContext());
+ return ElemTy;
+ }
+
+ // Get type for the following pattern
+ // ex) %a = alloca [100 x double], align 8
+ // tail call void @llvm.prefetch.p0(ptr nonnull %a, i32 0, i32 3, i32 1)
+ auto *Alloca = dyn_cast<AllocaInst>(Prefetch->getPointerOperand());
+ if (Alloca) {
+ auto *ElemTy = Alloca->getAllocatedType()->getArrayElementType();
+ if (isa<ArrayType>(ElemTy) || isa<StructType>(ElemTy))
+ return Type::getInt64Ty(I->getContext());
+ return ElemTy;
+ }
+
+ return Type::getInt64Ty(I->getContext());
+}
+
+/// A helper function that returns the pseudo-alignment of a prefetch instruction.
+inline Align getPrefetchPseudoAlignment(Value *I) {
+ assert(isa<PrefetchInst>(I) && "Expected Prefetch instruction");
+ auto *Ty = getPrefetchPseudoType(I);
+ return Ty ? Align(Ty->getScalarSizeInBits() >> 3) : Align(1ULL);
+}
+
+/// A helper function that returns the alignment of a load/store/prefetch instruction.
+inline Align getLdStPfAlignment(Value *I) {
+ if (isa<PrefetchInst>(I))
+ return getPrefetchPseudoAlignment(I);
+ return getLoadStoreAlignment(I);
+}
+
+/// A helper function that returns the pointer operand of a load/store/prefetch
+/// instruction. Returns nullptr if it is none of these.
+inline const Value *getLdStPfPointerOperand(const Value *I) {
+ if (isa<PrefetchInst>(I))
+ return getPrefetchPointerOperand(I);
+ return getLoadStorePointerOperand(I);
+}
+inline Value *getLdStPfPointerOperand(Value *V) {
+ return const_cast<Value *>(
+ getLdStPfPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a load/store/prefetch instruction.
+inline unsigned getLdStPfAddressSpace(Value *I) {
+ if (isa<PrefetchInst>(I))
+ return getPrefetchAddressSpace(I);
+ return getLoadStoreAddressSpace(I);
+}
+
+/// A helper function that returns the type of a load/store/prefetch instruction.
+inline Type *getLdStPfType(Value *I) {
+ if (isa<PrefetchInst>(I))
+ return getPrefetchPseudoType(I);
+ return getLoadStoreType(I);
+}
+
/// This class represents any memmove intrinsic
/// i.e. llvm.element.unordered.atomic.memmove
/// and llvm.memmove
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index cd6061a190fbbc0..940e20a7a070191 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2230,6 +2230,21 @@ def int_masked_compressstore:
[IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
NoCapture<ArgIndex<1>>]>;
+def int_masked_prefetch:
+ DefaultAttrsIntrinsic<[],
+ [llvm_anyptr_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty],
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+ ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
+def int_masked_gather_prefetch:
+ DefaultAttrsIntrinsic<[],
+ [llvm_anyvector_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+ [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+ ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
// Test whether a pointer is associated with a type metadata identifier.
def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
[IntrNoMem, IntrWillReturn, IntrSpeculatable]>;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..ce0ebd05df29d84 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -433,6 +433,11 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType,
return TTIImpl->isLegalMaskedLoad(DataType, Alignment);
}
+bool TargetTransformInfo::isLegalMaskedPrefetch(Type *DataType,
+ Align Alignment) const {
+ return TTIImpl->isLegalMaskedPrefetch(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalNTStore(Type *DataType,
Align Alignment) const {
return TTIImpl->isLegalNTStore(DataType, Alignment);
@@ -481,6 +486,11 @@ bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
return TTIImpl->isLegalMaskedExpandLoad(DataType);
}
+bool TargetTransformInfo::isLegalMaskedGatherPrefetch(Type *DataType,
+ Align Alignment) const {
+ return TTIImpl->isLegalMaskedGatherPrefetch(DataType, Alignment);
+}
+
bool TargetTransformInfo::enableOrderedReductions() const {
return TTIImpl->enableOrderedReductions();
}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 13bb4e83a5b94d6..cf17f6e8d38284b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -92,6 +92,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::canonicalize:
case Intrinsic::fptosi_sat:
case Intrinsic::fptoui_sat:
+ case Intrinsic::prefetch:
return true;
default:
return false;
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 974e29841e1bc63..fc62ae568f6ca32 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -539,6 +539,27 @@ CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
}
+/// Create a call to a Masked Prefetch intrinsic.
+/// \p Ptr - base pointer for the prefetch
+/// \p ElemSize - element size for memory address generation
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
+/// \p RW - Read or Write
+/// \p Locality - Cache Level
+/// \p Name - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedPrefetch(Value *Ptr,
+ Value *ElemSize,
+ Value *Mask, Value *RW,
+ Value *Locality,
+ const Twine &Name) {
+ auto *PtrTy = cast<PointerType>(Ptr->getType());
+ assert(Mask && "Mask should not be all-ones (null)");
+ Type *OverloadedTypes[] = {PtrTy, Mask->getType()};
+ Value *Ops[] = {Ptr, ElemSize, RW, Locality, Mask};
+ return CreateMaskedIntrinsic(Intrinsic::masked_prefetch, Ops,
+ OverloadedTypes, Name);
+}
+
/// Create a call to a Masked intrinsic, with given intrinsic Id,
/// an array of operands - Ops, and an array of overloaded types -
/// OverloadedTypes.
@@ -645,6 +666,34 @@ CallInst *IRBuilderBase::CreateMaskedCompressStore(Value *Val, Value *Ptr,
OverloadedTypes);
}
+/// Create a call to a Masked Gather Prefetch intrinsic.
+/// \p Ptrs - vector of pointers for prefetching
+/// \p ElemSize - element size for memory address generation
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
+/// \p RW - Read or Write
+/// \p Locality - Cache Level
+/// \p Name - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedGatherPrefetch(Value *Ptrs, Value *ElemSize,
+ Value *Mask, Value *RW,
+ Value *Locality,
+ const Twine &Name) {
+ auto *PtrsTy = cast<VectorType>(Ptrs->getType());
+ ElementCount NumElts = PtrsTy->getElementCount();
+
+ if (!Mask)
+ Mask = Constant::getAllOnesValue(
+ VectorType::get(Type::getInt1Ty(Context), NumElts));
+
+ Type *OverloadedTypes[] = {PtrsTy};
+ Value *Ops[] = {Ptrs, ElemSize, RW, Locality, Mask};
+
+ // We specify only one type when we create this intrinsic. Types of other
+ // arguments are derived from this type.
+ return CreateMaskedIntrinsic(Intrinsic::masked_gather_prefetch, Ops, OverloadedTypes,
+ Name);
+}
+
template <typename T0>
static std::vector<Value *>
getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..5c994bf2d21e042 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -600,7 +600,7 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
ElementCount VF) const {
- Value *Ptr = getLoadStorePointerOperand(&I);
+ Value *Ptr = getLdStPfPointerOperand(&I);
if (!Ptr)
return false;
// Note: There's nothing inherent which prevents predicated loads and
@@ -1284,6 +1284,12 @@ bool LoopVectorizationLegality::blockCanBePredicated(
continue;
}
+ // Prefetches are handled via masking
+ if (auto *PF = dyn_cast<PrefetchInst>(&I)) {
+ MaskedOp.insert(PF);
+ continue;
+ }
+
if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a203c4794eac943..6386df48e336fd8 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1468,19 +1468,28 @@ class LoopVectorizationCostModel {
TTI.isLegalMaskedLoad(DataType, Alignment);
}
+ /// Returns true if the target machine supports masked prefetch operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedPrefetch(Type *DataType, Value *Ptr, Align Alignment) const {
+ return Legal->isConsecutivePtr(DataType, Ptr) &&
+ TTI.isLegalMaskedPrefetch(DataType, Alignment);
+ }
+
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
- if (!LI && !SI)
+ bool PF = isa<PrefetchInst>(V);
+ if (!LI && !SI && !PF)
return false;
- auto *Ty = getLoadStoreType(V);
- Align Align = getLoadStoreAlignment(V);
+ auto *Ty = getLdStPfType(V);
+ Align Align = getLdStPfAlignment(V);
if (VF.isVector())
Ty = VectorType::get(Ty, VF);
return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
- (SI && TTI.isLegalMaskedScatter(Ty, Align));
+ (SI && TTI.isLegalMaskedScatter(Ty, Align)) ||
+ (PF && TTI.isLegalMaskedPrefetch(Ty, Align));
}
/// Returns true if the target machine supports all of the reduction
@@ -4278,8 +4287,18 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
switch(I->getOpcode()) {
default:
return true;
- case Instruction::Call:
- return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+ case Instruction::Call: {
+ if (!isa<PrefetchInst>(I))
+ return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+ auto *Ptr = getPrefetchPointerOperand(I);
+ auto *Ty = getPrefetchPseudoType(I);
+ Type *VTy = Ty;
+ if (VF.isVector())
+ VTy = VectorType::get(Ty, VF);
+ const Align Alignment = getPrefetchPseudoAlignment(I);
+ return !(isLegalMaskedPrefetch(Ty, Ptr, Alignment) ||
+ TTI.isLegalMaskedGatherPrefetch(VTy, Alignment));
+ }
case Instruction::Load:
case Instruction::Store: {
auto *Ptr = getLoadStorePointerOperand(I);
@@ -4486,10 +4505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
Instruction *I, ElementCount VF) {
// Get and ensure we have a valid memory instruction.
- assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+ assert((isa<LoadInst, StoreInst, PrefetchInst>(I)) && "Invalid memory instruction");
- auto *Ptr = getLoadStorePointerOperand(I);
- auto *ScalarTy = getLoadStoreType(I);
+ auto *Ptr = getLdStPfPointerOperand(I);
+ auto *ScalarTy = getLdStPfType(I);
// In order to be widened, the pointer should be consecutive, first of all.
if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
@@ -6346,11 +6365,11 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
if (VF.isScalable())
return InstructionCost::getInvalid();
- Type *ValTy = getLoadStoreType(I);
+ Type *ValTy = getLdStPfType(I);
auto SE = PSE.getSE();
- unsigned AS = getLoadStoreAddressSpace(I);
- Value *Ptr = getLoadStorePointerOperand(I);
+ unsigned AS = getLdStPfAddressSpace(I);
+ Value *Ptr = getLdStPfPointerOperand(I);
Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
// NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
// that it is being called from this specific place.
@@ -6366,7 +6385,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- const Align Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLdStPfAlignment(I);
Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
ValTy->getScalarType(),
Alignment, AS, CostKind);
@@ -6401,16 +6420,16 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getLoadStoreType(I);
+ Type *ValTy = getLdStPfType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- Value *Ptr = getLoadStorePointerOperand(I);
- unsigned AS = getLoadStoreAddressSpace(I);
+ Value *Ptr = getLdStPfPointerOperand(I);
+ unsigned AS = getLdStPfAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
- const Align Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLdStPfAlignment(I);
InstructionCost Cost = 0;
if (Legal->isMaskRequired(I)) {
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6433,11 +6452,16 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I, VF));
- Type *ValTy = getLoadStoreType(I);
+ Type *ValTy = getLdStPfType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- unsigned AS = getLoadStoreAddressSpace(I);
+ const Align Alignment = getLdStPfAlignment(I);
+ unsigned AS = getLdStPfAddressSpace(I);
enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (isa<PrefetchInst>(I)) {
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Call, ValTy, Alignment, AS,
+ CostKind);
+ }
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
@@ -6459,10 +6483,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getLoadStoreType(I);
+ Type *ValTy = getLdStPfType(I);
auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- const Value *Ptr = getLoadStorePointerOperand(I);
+ const Align Alignment = getLdStPfAlignment(I);
+ const Value *Ptr = getLdStPfPointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
TTI.getGatherScatterOpCost(
@@ -6688,9 +6712,9 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// Calculate scalar cost only. Vectorization cost should be ready at this
// moment.
if (VF.isScalar()) {
- Type *ValTy = getLoadStoreType(I);
- const Align Alignment = getLoadStoreAlignment(I);
- unsigned AS = getLoadStoreAddressSpace(I);
+ Type *ValTy = getLdStPfType(I);
+ const Align Alignment = getLdStPfAlignment(I);
+ unsigned AS = getLdStPfAddressSpace(I);
TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
return TTI.getAddressComputationCost(ValTy) +
@@ -6791,7 +6815,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
for (BasicBlock *BB : TheLoop->blocks()) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
- Value *Ptr = getLoadStorePointerOperand(&I);
+ Value *Ptr = getLdStPfPointerOperand(&I);
if (!Ptr)
continue;
@@ -6851,7 +6875,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (memoryInstructionCanBeWidened(&I, VF)) {
InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
int ConsecutiveStride = Legal->isConsecutivePtr(
- getLoadStoreType(&I), getLoadStorePointerOperand(&I));
+ getLdStPfType(&I), getLdStPfPointerOperand(&I));
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Expected consecutive stride.");
InstWidening Decision =
@@ -7285,6 +7309,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
}
case Instruction::Call: {
+ if (isa<PrefetchInst>(I)) {
+ ElementCount Width = VF;
+ if (Width.isVector()) {
+ InstWidening Decision = getWideningDecision(I, Width);
+ assert(Decision != CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+ return InstructionCost::getInvalid();
+ if (Decision == CM_Scalarize)
+ Width = ElementCount::getFixed(1);
+ }
+ VectorTy = ToVectorTy(getLdStPfType(I), Width);
+ return getMemoryInstructionCost(I, VF);
+ }
if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
return *RedCost;
@@ -8134,7 +8172,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
ArrayRef<VPValue *> Operands,
VFRange &Range,
VPlanPtr &Plan) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Must be called with either a load or store");
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I) || isa<PrefetchInst>(I)) &&
+ "Must be called with either a load, store or prefetch");
auto willWiden = [&](ElementCount VF) -> bool {
@@ -8169,6 +8207,10 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
Consecutive, Reverse);
+ if (PrefetchInst *Prefetch = dyn_cast<PrefetchInst>(I))
+ return new VPWidenMemoryInstructionRecipe(*Prefetch, Operands[0], Mask,
+ Consecutive, Reverse);
+
StoreInst *Store = cast<StoreInst>(I);
return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
Mask, Consecutive, Reverse);
@@ -8588,10 +8630,12 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
[&](ElementCount VF) { return VF.isScalar(); }, Range))
return nullptr;
- if (auto *CI = dyn_cast<CallInst>(Instr))
+ if (isa<CallInst>(Instr) && !isa<PrefetchInst>(Instr)) {
+ auto *CI = cast<CallInst>(Instr);
return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+ }
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr) || isa<PrefetchInst>(Instr))
return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
if (!shouldWiden(Instr, Range))
@@ -9400,7 +9444,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
if (IsUniform) {
// If the recipe is uniform across all parts (instead of just per VF), only
// generate a single instance.
- if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
+ if ((isa<LoadInst>(UI) || isa<StoreInst>(UI) || isa<PrefetchInst>(UI)) &&
all_of(operands(), [](VPValue *Op) {
return Op->isDefinedOutsideVectorRegions();
})) {
@@ -9430,6 +9474,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
return;
}
+ // A prefetch of a uniform address only needs the last copy of the
+ // prefetch.
+ if (isa<PrefetchInst>(UI) &&
+ vputils::isUniformAfterVectorization(getOperand(0))) {
+ auto Lane = VPLane::getLastLaneForVF(State.VF);
+ State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
+ State);
+ return;
+ }
+
// Generate scalar instances for all VF lanes of all UF parts.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
const unsigned EndLane = State.VF.getKnownMinValue();
@@ -9444,15 +9498,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+ PrefetchInst *PF = dyn_cast<PrefetchInst>(&Ingredient);
- assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((LI || SI || PF) && "Invalid Load/Store/Prefetch instruction");
assert((!SI || StoredValue) && "No stored value provided for widened store");
assert((!LI || !StoredValue) && "Stored value provided for widened load");
+ assert((!PF || !StoredValue) && "Stored value provided for widened prefetch");
- Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+ Type *ScalarDataTy = getLdStPfType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
- const Align Alignment = getLoadStoreAlignment(&Ingredient);
+ const Align Alignment = getLdStPfAlignment(&Ingredient);
bool CreateGatherScatter = !isConsecutive();
auto &Builder = State.Builder;
@@ -9535,6 +9591,41 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
return;
}
+ if (PF) {
+ State.setDebugLocFrom(PF->getDebugLoc());
+
+ Type *ESizeTy = Type::getInt32Ty(PF->getContext());
+ int32_t ESize = ScalarDataTy->getScalarSizeInBits() >> 3;
+ Value *ElemSize = ConstantInt::get(ESizeTy, ESize);
+ Value *RW = PF->getArgOperand(1);
+ Value *Locality = PF->getArgOperand(2);
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Instruction *NewPF = nullptr;
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(getAddr(), Part);
+ NewPF = Builder.CreateMaskedGatherPrefetch(VectorGep, ElemSize,
+ MaskPart, RW, Locality);
+ } else {
+ auto *VecPtr =
+ CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+ if (isMaskRequired)
+ NewPF = Builder.CreateMaskedPrefetch(VecPtr, ElemSize,
+ BlockInMaskParts[Part],
+ RW, Locality);
+ else {
+ auto *MaskPart = Constant::getAllOnesValue(
+ VectorType::get(Type::getInt1Ty(DataTy->getContext()), DataTy));
+ NewPF = Builder.CreateMaskedPrefetch(VecPtr, ElemSize,
+ MaskPart, RW, Locality);
+ }
+ }
+ State.addMetadata(NewPF, PF);
+ }
+ return;
+ }
+
// Handle loads.
assert(LI && "Must have a load instruction");
State.setDebugLocFrom(LI->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f5e667cf2e5a04..ae9d70b80c1e397 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1949,7 +1949,8 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
}
bool isMasked() const {
- return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
+ return isPrefetch() ? getNumOperands() == 5 :
+ isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
}
public:
@@ -1971,6 +1972,14 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
setMask(Mask);
}
+ VPWidenMemoryInstructionRecipe(PrefetchInst &Prefetch, VPValue *Addr, VPValue *Mask,
+ bool Consecutive, bool Reverse)
+ : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}),
+ Ingredient(Prefetch), Consecutive(Consecutive), Reverse(Reverse) {
+ assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+ setMask(Mask);
+ }
+
VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC)
/// Return the address accessed by this recipe.
@@ -1988,6 +1997,9 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
/// Returns true if this recipe is a store.
bool isStore() const { return isa<StoreInst>(Ingredient); }
+ /// Returns true if this recipe is a prefetch.
+ bool isPrefetch() const { return isa<PrefetchInst>(Ingredient); }
+
/// Return the address accessed by this recipe.
VPValue *getStoredValue() const {
assert(isStore() && "Stored value only available for store instructions");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fbb67fa3308d0fa..ca055b9798cf3ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -146,13 +146,17 @@ bool VPRecipeBase::mayHaveSideEffects() const {
"underlying instruction has side-effects");
return false;
}
- case VPWidenMemoryInstructionSC:
+ case VPWidenMemoryInstructionSC: {
+ auto *R = cast<VPWidenMemoryInstructionRecipe>(this);
+ if (isa<PrefetchInst>(R->getIngredient()))
+ return true;
assert(cast<VPWidenMemoryInstructionRecipe>(this)
->getIngredient()
.mayHaveSideEffects() == mayWriteToMemory() &&
"mayHaveSideffects result for ingredient differs from this "
"implementation");
return mayWriteToMemory();
+ }
case VPReplicateSC: {
auto *R = cast<VPReplicateRecipe>(this);
return R->getUnderlyingInstr()->mayHaveSideEffects();
@@ -1390,7 +1394,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
- if (!isStore()) {
+ if (!isStore() && !isPrefetch()) {
getVPSingleValue()->printAsOperand(O, SlotTracker);
O << " = ";
}
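
For reviewers, a rough usage sketch of the two IRBuilder entry points added
above (assumptions: Builder, Ptr, Ptrs and Mask already exist in the caller;
only the CreateMaskedPrefetch / CreateMaskedGatherPrefetch signatures come
from this patch):

  // Hypothetical caller, not part of the patch. Builder is an IRBuilderBase
  // positioned inside a function; Ptr is a scalar ptr, Ptrs is a <4 x ptr>
  // vector of addresses, and Mask is a <4 x i1> lane mask.
  llvm::Value *ElemSize = Builder.getInt32(8); // element size in bytes
  llvm::Value *RW = Builder.getInt32(0);       // 0 = prefetch for read
  llvm::Value *Locality = Builder.getInt32(3); // keep in all cache levels
  // Consecutive addresses -> llvm.masked.prefetch
  Builder.CreateMaskedPrefetch(Ptr, ElemSize, Mask, RW, Locality);
  // Arbitrary per-lane addresses -> llvm.masked.gather.prefetch
  Builder.CreateMaskedGatherPrefetch(Ptrs, ElemSize, Mask, RW, Locality);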