[llvm] [Vectorize] Vectorization for __builtin_prefetch (PR #66160)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 12 17:10:14 PDT 2023


https://github.com/m-saito-fj created https://github.com/llvm/llvm-project/pull/66160:

Allow vectorization of loops containing __builtin_prefetch. To support this, add masked_prefetch and masked_gather_prefetch intrinsics, and teach LoopVectorize to widen the prefetch intrinsic.
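
For illustration, a minimal sketch (not part of the patch; function and variable names are hypothetical) of the kind of loop this change targets, mirroring the double-array pattern cited in the getPrefetchPseudoType comment below:

    // A loop of this shape previously stayed scalar because of the prefetch
    // call; with this patch the call is widened to llvm.masked.prefetch
    // (consecutive addresses) or llvm.masked.gather.prefetch (otherwise).
    void axpy_with_prefetch(double *a, const double *b, int n) {
      for (int i = 0; i < n; ++i) {
        __builtin_prefetch(&b[i + 8], /*rw=*/0, /*locality=*/3);
        a[i] += b[i];
      }
    }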

>From 4c3070e3fd66b522fcf6f8f19cc42cba8b492654 Mon Sep 17 00:00:00 2001
From: Moriyuki Saito <saitou.moriyuki at fujitsu.com>
Date: Tue, 12 Sep 2023 19:24:15 +0900
Subject: [PATCH] [Vectorize] Vectorization for __builtin_prefetch

Allow vectorization of loops containing __builtin_prefetch. To support
this, add masked_prefetch and masked_gather_prefetch intrinsics, and
teach LoopVectorize to widen the prefetch intrinsic.
---
 .../llvm/Analysis/TargetTransformInfo.h       |  12 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   8 +
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  16 ++
 llvm/include/llvm/IR/IRBuilder.h              |  11 ++
 llvm/include/llvm/IR/IntrinsicInst.h          | 109 ++++++++++++
 llvm/include/llvm/IR/Intrinsics.td            |  15 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |  10 ++
 llvm/lib/Analysis/VectorUtils.cpp             |   1 +
 llvm/lib/IR/IRBuilder.cpp                     |  49 ++++++
 .../Vectorize/LoopVectorizationLegality.cpp   |   8 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 160 ++++++++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.h         |  14 +-
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |   8 +-
 13 files changed, 382 insertions(+), 39 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1ae595d2110457d..935860420afa5b6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -743,6 +743,8 @@ class TargetTransformInfo {
   bool isLegalMaskedStore(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked load.
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked prefetch.
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const;
 
   /// Return true if the target supports nontemporal store.
   bool isLegalNTStore(Type *DataType, Align Alignment) const;
@@ -757,6 +759,8 @@ class TargetTransformInfo {
   bool isLegalMaskedScatter(Type *DataType, Align Alignment) const;
   /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const;
+  /// Return true if the target supports masked gather prefetch.
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const;
   /// Return true if the target forces scalarizing of llvm.masked.gather
   /// intrinsics.
   bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const;
@@ -1769,12 +1773,14 @@ class TargetTransformInfo::Concept {
     getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const = 0;
   virtual bool isLegalMaskedStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalNTLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalBroadcastLoad(Type *ElementTy,
                                     ElementCount NumElements) const = 0;
   virtual bool isLegalMaskedScatter(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedGather(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) = 0;
   virtual bool forceScalarizeMaskedGather(VectorType *DataType,
                                           Align Alignment) = 0;
   virtual bool forceScalarizeMaskedScatter(VectorType *DataType,
@@ -2225,6 +2231,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedLoad(DataType, Alignment);
   }
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedPrefetch(DataType, Alignment);
+  }
   bool isLegalNTStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalNTStore(DataType, Alignment);
   }
@@ -2241,6 +2250,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool isLegalMaskedGather(Type *DataType, Align Alignment) override {
     return Impl.isLegalMaskedGather(DataType, Alignment);
   }
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) override {
+    return Impl.isLegalMaskedGatherPrefetch(DataType, Alignment);
+  }
   bool forceScalarizeMaskedGather(VectorType *DataType,
                                   Align Alignment) override {
     return Impl.forceScalarizeMaskedGather(DataType, Alignment);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4ab33995618213c..076f39f922a8c4c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -254,6 +254,10 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool isLegalMaskedPrefetch(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool isLegalNTStore(Type *DataType, Align Alignment) const {
     // By default, assume nontemporal memory stores are available for stores
     // that are aligned and have a size that is a power of 2.
@@ -280,6 +284,10 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool isLegalMaskedGatherPrefetch(Type *DataType, Align Alignment) const {
+    return false;
+  }
+
   bool forceScalarizeMaskedGather(VectorType *DataType, Align Alignment) const {
     return false;
   }
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c11d558a73e9d09..d9af917754f14cc 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1563,6 +1563,15 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                              VarMask, Alignment, CostKind, I);
     }
+    case Intrinsic::masked_gather_prefetch: {
+      const Value *Mask = Args[4];
+      bool VarMask = !isa<Constant>(Mask);
+      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
+      auto *MaskVT = cast<VectorType>(Mask->getType());
+    auto *PseudoDataTy = MaskVT->getWithNewBitWidth(Alignment.value() * 8);
+    return thisT()->getGatherScatterOpCost(Instruction::Call, PseudoDataTy, Args[0],
+                                             VarMask, Alignment, CostKind, I);
+    }
     case Intrinsic::experimental_stepvector: {
       if (isa<ScalableVectorType>(RetTy))
         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
@@ -1880,6 +1889,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                             CostKind);
     }
+    case Intrinsic::masked_prefetch: {
+      auto *MaskVT = cast<VectorType>(ICA.getArgTypes()[4]);
+      Type *PsudoTy = MaskVT->getWithNewBitWidth(32);
+      Align TyAlign = thisT()->DL.getABITypeAlign(PsudoTy);
+      return thisT()->getMaskedMemoryOpCost(Instruction::Call, PsudoTy, TyAlign, 0,
+                                            CostKind);
+    }
     case Intrinsic::vector_reduce_add:
       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
                                                  std::nullopt, CostKind);
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index ef86eefdf33b834..cbfcfe8b4c5227c 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -801,6 +801,11 @@ class IRBuilderBase {
   CallInst *CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment,
                               Value *Mask);
 
+  /// Create a call to Masked Prefetch intrinsic
+  CallInst *CreateMaskedPrefetch(Value *Ptr, Value *ElemSize, Value *Mask,
+                                 Value *RW = nullptr, Value *Locality = nullptr,
+                                 const Twine &Name = "");
+
   /// Create a call to Masked Gather intrinsic
   CallInst *CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment,
                                Value *Mask = nullptr, Value *PassThru = nullptr,
@@ -810,6 +815,12 @@ class IRBuilderBase {
   CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, Align Alignment,
                                 Value *Mask = nullptr);
 
+  /// Create a call to Masked Gather Prefetch intrinsic
+  CallInst *CreateMaskedGatherPrefetch(Value *Ptrs, Value *ElemSize,
+                                       Value *Mask = nullptr, Value *RW = nullptr,
+                                       Value *Locality = nullptr,
+                                       const Twine &Name = "");
+
   /// Create a call to Masked Expand Load intrinsic
   CallInst *CreateMaskedExpandLoad(Type *Ty, Value *Ptr, Value *Mask = nullptr,
                                    Value *PassThru = nullptr,
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
index 62bd833198f022b..a81592aca6dd25e 100644
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -1328,6 +1328,115 @@ class AnyMemCpyInst : public AnyMemTransferInst {
   }
 };
 
+/// This class represents the prefetch intrinsic
+/// i.e. llvm.prefetch
+class PrefetchInst : public IntrinsicInst {
+public:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::prefetch;
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+
+  Value *getPointerOperand() { return getOperand(0); }
+  const Value *getPointerOperand() const { return getOperand(0); }
+  static unsigned getPointerOperandIndex() { return 0U; }
+  Type *getPointerOperandType() const { return getPointerOperand()->getType(); }
+};
+
+/// A helper function that returns the pointer operand of a prefetch
+/// instruction. Returns nullptr if it is not a prefetch.
+inline const Value *getPrefetchPointerOperand(const Value *V) {
+  if (auto *Prefetch = dyn_cast<PrefetchInst>(V))
+    return Prefetch->getPointerOperand();
+  return nullptr;
+}
+inline Value *getPrefetchPointerOperand(Value *V) {
+  return const_cast<Value *>(
+      getPrefetchPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a prefetch instruction.
+inline unsigned getPrefetchAddressSpace(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected prefetch instruction");
+  auto *PtrTy = dyn_cast<PrefetchInst>(I)->getPointerOperandType();
+  return dyn_cast<PointerType>(PtrTy)->getAddressSpace();
+}
+
+/// A helper function that returns the pseudo type of a prefetch instruction.
+inline Type *getPrefetchPseudoType(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected Prefetch instruction");
+  auto *Prefetch = dyn_cast<PrefetchInst>(I);
+
+  // Get type for the following pattern
+  // ex) %1 = add nuw nsw i64 %indvars.iv, 8
+  //     %arrayidx = getelementptr inbounds double, ptr %b, i64 %1
+  //     tail call void @llvm.prefetch.p0(ptr nonnull %arrayidx, i32 0, i32 3, i32 1)
+  auto *GEP = dyn_cast<GetElementPtrInst>(Prefetch->getPointerOperand());
+  if (GEP) {
+    auto *ElemTy = GEP->getSourceElementType();
+    if (isa<ArrayType>(ElemTy) || isa<StructType>(ElemTy))
+      return Type::getInt64Ty(I->getContext());
+    return ElemTy;
+  }
+
+  // Get type for the following pattern
+  // ex) %a = alloca [100 x double], align 8
+  //     tail call void @llvm.prefetch.p0(ptr nonnull %a, i32 0, i32 3, i32 1)
+  auto *Alloca = dyn_cast<AllocaInst>(Prefetch->getPointerOperand());
+  if (Alloca) {
+    auto *ElemTy = Alloca->getAllocatedType()->getArrayElementType();
+    if (isa<ArrayType>(ElemTy) || isa<StructType>(ElemTy))
+      return Type::getInt64Ty(I->getContext());
+    return ElemTy;
+  }
+
+  return Type::getInt64Ty(I->getContext());
+}
+
+/// A helper function that returns the pseudo-alignment of a prefetch instruction.
+inline Align getPrefetchPseudoAlignment(Value *I) {
+  assert(isa<PrefetchInst>(I) && "Expected Prefetch instruction");
+  auto *Ty = getPrefetchPseudoType(I);
+  return Ty ? Align(Ty->getScalarSizeInBits() >> 3) : Align(1ULL);
+}
+
+/// A helper function that returns the alignment of a load/store/prefetch instruction.
+inline Align getLdStPfAlignment(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchPseudoAlignment(I);
+  return getLoadStoreAlignment(I);
+}
+
+/// A helper function that returns the pointer operand of a load/store/prefetch
+/// instruction. Returns nullptr if it is not a load, store, or prefetch.
+inline const Value *getLdStPfPointerOperand(const Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchPointerOperand(I);
+  return getLoadStorePointerOperand(I);
+}
+inline Value *getLdStPfPointerOperand(Value *V) {
+  return const_cast<Value *>(
+      getLdStPfPointerOperand(static_cast<const Value *>(V)));
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// a load/store/prefetch instruction.
+inline unsigned getLdStPfAddressSpace(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchAddressSpace(I);
+  return getLoadStoreAddressSpace(I);
+}
+
+/// A helper function that returns the type of a load/store/prefetch instruction.
+inline Type *getLdStPfType(Value *I) {
+  if (isa<PrefetchInst>(I))
+    return getPrefetchPseudoType(I);
+  return getLoadStoreType(I);
+}
+
 /// This class represents any memmove intrinsic
 /// i.e. llvm.element.unordered.atomic.memmove
 ///  and llvm.memmove
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index cd6061a190fbbc0..940e20a7a070191 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2230,6 +2230,21 @@ def int_masked_compressstore:
             [IntrWriteMem, IntrArgMemOnly, IntrWillReturn,
              NoCapture<ArgIndex<1>>]>;
 
+def int_masked_prefetch:
+  DefaultAttrsIntrinsic<[],
+            [llvm_anyptr_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty],
+            [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
+def int_masked_gather_prefetch:
+  DefaultAttrsIntrinsic<[],
+            [llvm_anyvector_ty,
+             llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+             LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+            [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn,
+             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
 // Test whether a pointer is associated with a type metadata identifier.
 def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty],
                               [IntrNoMem, IntrWillReturn, IntrSpeculatable]>;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index c751d174a48ab1f..ce0ebd05df29d84 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -433,6 +433,11 @@ bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType,
   return TTIImpl->isLegalMaskedLoad(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalMaskedPrefetch(Type *DataType,
+                                                Align Alignment) const {
+  return TTIImpl->isLegalMaskedPrefetch(DataType, Alignment);
+}
+
 bool TargetTransformInfo::isLegalNTStore(Type *DataType,
                                          Align Alignment) const {
   return TTIImpl->isLegalNTStore(DataType, Alignment);
@@ -481,6 +486,11 @@ bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
   return TTIImpl->isLegalMaskedExpandLoad(DataType);
 }
 
+bool TargetTransformInfo::isLegalMaskedGatherPrefetch(Type *DataType,
+                                                      Align Alignment) const {
+  return TTIImpl->isLegalMaskedGatherPrefetch(DataType, Alignment);
+}
+
 bool TargetTransformInfo::enableOrderedReductions() const {
   return TTIImpl->enableOrderedReductions();
 }
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 13bb4e83a5b94d6..cf17f6e8d38284b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -92,6 +92,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
   case Intrinsic::canonicalize:
   case Intrinsic::fptosi_sat:
   case Intrinsic::fptoui_sat:
+  case Intrinsic::prefetch:
     return true;
   default:
     return false;
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 974e29841e1bc63..fc62ae568f6ca32 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -539,6 +539,27 @@ CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
   return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
 }
 
+/// Create a call to a Masked Prefetch intrinsic.
+/// \p Ptr      - base pointer for the prefetch
+/// \p ElemSize - element size for memory address generation
+/// \p Mask     - vector of booleans which indicates what vector lanes should
+///               be accessed in memory
+/// \p RW       - Read or Write
+/// \p Locality - Cache Level
+/// \p Name     - name of the call
+CallInst *IRBuilderBase::CreateMaskedPrefetch(Value *Ptr,
+                                              Value *ElemSize,
+                                              Value *Mask, Value *RW,
+                                              Value *Locality,
+                                              const Twine &Name) {
+  auto *PtrTy = cast<PointerType>(Ptr->getType());
+  assert(Mask && "Mask should not be all-ones (null)");
+  Type *OverloadedTypes[] = {PtrTy, Mask->getType()};
+  Value *Ops[] = {Ptr, ElemSize, RW, Locality, Mask};
+  return CreateMaskedIntrinsic(Intrinsic::masked_prefetch, Ops,
+                               OverloadedTypes, Name);
+}
+
 /// Create a call to a Masked intrinsic, with given intrinsic Id,
 /// an array of operands - Ops, and an array of overloaded types -
 /// OverloadedTypes.
@@ -645,6 +666,34 @@ CallInst *IRBuilderBase::CreateMaskedCompressStore(Value *Val, Value *Ptr,
                                OverloadedTypes);
 }
 
+/// Create a call to a Masked Gather Prefetch intrinsic.
+/// \p Ptrs     - vector of pointers to prefetch
+/// \p ElemSize - element size for memory address generation
+/// \p Mask     - vector of booleans which indicates what vector lanes should
+///               be accessed in memory
+/// \p RW       - Read or Write
+/// \p Locality - Cache Level
+/// \p Name     - name of the call
+CallInst *IRBuilderBase::CreateMaskedGatherPrefetch(Value *Ptrs, Value *ElemSize,
+                                                    Value *Mask, Value *RW,
+                                                    Value *Locality,
+                                                    const Twine &Name) {
+  auto *PtrsTy = cast<VectorType>(Ptrs->getType());
+  ElementCount NumElts = PtrsTy->getElementCount();
+
+  if (!Mask)
+    Mask = Constant::getAllOnesValue(
+        VectorType::get(Type::getInt1Ty(Context), NumElts));
+
+  Type *OverloadedTypes[] = {PtrsTy};
+  Value *Ops[] = {Ptrs, ElemSize, RW, Locality, Mask};
+
+  // We specify only one type when we create this intrinsic. Types of other
+  // arguments are derived from this type.
+  return CreateMaskedIntrinsic(Intrinsic::masked_gather_prefetch, Ops, OverloadedTypes,
+                               Name);
+}
+
 template <typename T0>
 static std::vector<Value *>
 getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..5c994bf2d21e042 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -600,7 +600,7 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
 
 bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
                                                ElementCount VF) const {
-  Value *Ptr = getLoadStorePointerOperand(&I);
+  Value *Ptr = getLdStPfPointerOperand(&I);
   if (!Ptr)
     return false;
   // Note: There's nothing inherent which prevents predicated loads and
@@ -1284,6 +1284,12 @@ bool LoopVectorizationLegality::blockCanBePredicated(
       continue;
     }
 
+    // Prefetches are handled via masking.
+    if (auto *PF = dyn_cast<PrefetchInst>(&I)) {
+      MaskedOp.insert(PF);
+      continue;
+    }
+
     if (I.mayReadFromMemory() || I.mayWriteToMemory() || I.mayThrow())
       return false;
   }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a203c4794eac943..66a09ea32a9293f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1468,19 +1468,28 @@ class LoopVectorizationCostModel {
            TTI.isLegalMaskedLoad(DataType, Alignment);
   }
 
+  /// Returns true if the target machine supports masked prefetch operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedPrefetch(Type *DataType, Value *Ptr, Align Alignment) const {
+    return Legal->isConsecutivePtr(DataType, Ptr) &&
+           TTI.isLegalMaskedPrefetch(DataType, Alignment);
+  }
+
   /// Returns true if the target machine can represent \p V as a masked gather
   /// or scatter operation.
   bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
     bool LI = isa<LoadInst>(V);
     bool SI = isa<StoreInst>(V);
-    if (!LI && !SI)
+    bool PF = isa<PrefetchInst>(V);
+    if (!LI && !SI && !PF)
       return false;
-    auto *Ty = getLoadStoreType(V);
-    Align Align = getLoadStoreAlignment(V);
+    auto *Ty = getLdStPfType(V);
+    Align Align = getLdStPfAlignment(V);
     if (VF.isVector())
       Ty = VectorType::get(Ty, VF);
     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
-           (SI && TTI.isLegalMaskedScatter(Ty, Align));
+           (SI && TTI.isLegalMaskedScatter(Ty, Align)) ||
+           (PF && TTI.isLegalMaskedPrefetch(Ty, Align));
   }
 
   /// Returns true if the target machine supports all of the reduction
@@ -4278,8 +4287,18 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
   switch(I->getOpcode()) {
   default:
     return true;
-  case Instruction::Call:
-    return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+  case Instruction::Call: {
+    if (!isa<PrefetchInst>(I))
+      return !VFDatabase::hasMaskedVariant(*(cast<CallInst>(I)), VF);
+    auto *Ptr = getPrefetchPointerOperand(I);
+    auto *Ty = getPrefetchPseudoType(I);
+    Type *VTy = Ty;
+    if (VF.isVector())
+      VTy = VectorType::get(Ty, VF);
+    const Align Alignment = getPrefetchPseudoAlignment(I);
+    return !(isLegalMaskedPrefetch(Ty, Ptr, Alignment) ||
+             TTI.isLegalMaskedGatherPrefetch(VTy, Alignment));
+  }
   case Instruction::Load:
   case Instruction::Store: {
     auto *Ptr = getLoadStorePointerOperand(I);
@@ -4486,10 +4505,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
     Instruction *I, ElementCount VF) {
   // Get and ensure we have a valid memory instruction.
-  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
+  assert((isa<LoadInst, StoreInst, PrefetchInst>(I)) && "Invalid memory instruction");
 
-  auto *Ptr = getLoadStorePointerOperand(I);
-  auto *ScalarTy = getLoadStoreType(I);
+  auto *Ptr = getLdStPfPointerOperand(I);
+  auto *ScalarTy = getLdStPfType(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
@@ -6346,11 +6365,11 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   if (VF.isScalable())
     return InstructionCost::getInvalid();
 
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto SE = PSE.getSE();
 
-  unsigned AS = getLoadStoreAddressSpace(I);
-  Value *Ptr = getLoadStorePointerOperand(I);
+  unsigned AS = getLdStPfAddressSpace(I);
+  Value *Ptr = getLdStPfPointerOperand(I);
   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
   //       that it is being called from this specific place.
@@ -6366,7 +6385,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
   // Don't pass *I here, since it is scalar but will actually be part of a
   // vectorized loop where the user of it is a vectorized instruction.
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  const Align Alignment = getLoadStoreAlignment(I);
+  const Align Alignment = getLdStPfAlignment(I);
   Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(I->getOpcode(),
                                                       ValTy->getScalarType(),
                                                       Alignment, AS, CostKind);
@@ -6401,16 +6420,16 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 InstructionCost
 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
                                                     ElementCount VF) {
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  Value *Ptr = getLoadStorePointerOperand(I);
-  unsigned AS = getLoadStoreAddressSpace(I);
+  Value *Ptr = getLdStPfPointerOperand(I);
+  unsigned AS = getLdStPfAddressSpace(I);
   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 
   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
          "Stride should be 1 or -1 for consecutive memory access");
-  const Align Alignment = getLoadStoreAlignment(I);
+  const Align Alignment = getLdStPfAlignment(I);
   InstructionCost Cost = 0;
   if (Legal->isMaskRequired(I)) {
     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
@@ -6433,11 +6452,16 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                 ElementCount VF) {
   assert(Legal->isUniformMemOp(*I, VF));
 
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  unsigned AS = getLoadStoreAddressSpace(I);
+  const Align Alignment = getLdStPfAlignment(I);
+  unsigned AS = getLdStPfAddressSpace(I);
   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  if (isa<PrefetchInst>(I)) {
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(Instruction::Call, ValTy, Alignment, AS,
+                               CostKind);
+  }
   if (isa<LoadInst>(I)) {
     return TTI.getAddressComputationCost(ValTy) +
            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
@@ -6459,10 +6483,10 @@ LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
 InstructionCost
 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
                                                  ElementCount VF) {
-  Type *ValTy = getLoadStoreType(I);
+  Type *ValTy = getLdStPfType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
-  const Align Alignment = getLoadStoreAlignment(I);
-  const Value *Ptr = getLoadStorePointerOperand(I);
+  const Align Alignment = getLdStPfAlignment(I);
+  const Value *Ptr = getLdStPfPointerOperand(I);
 
   return TTI.getAddressComputationCost(VectorTy) +
          TTI.getGatherScatterOpCost(
@@ -6688,9 +6712,9 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   // Calculate scalar cost only. Vectorization cost should be ready at this
   // moment.
   if (VF.isScalar()) {
-    Type *ValTy = getLoadStoreType(I);
-    const Align Alignment = getLoadStoreAlignment(I);
-    unsigned AS = getLoadStoreAddressSpace(I);
+    Type *ValTy = getLdStPfType(I);
+    const Align Alignment = getLdStPfAlignment(I);
+    unsigned AS = getLdStPfAddressSpace(I);
 
     TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0));
     return TTI.getAddressComputationCost(ValTy) +
@@ -6791,7 +6815,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
-      Value *Ptr =  getLoadStorePointerOperand(&I);
+      Value *Ptr = getLdStPfPointerOperand(&I);
       if (!Ptr)
         continue;
 
@@ -6851,7 +6875,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       if (memoryInstructionCanBeWidened(&I, VF)) {
         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
         int ConsecutiveStride = Legal->isConsecutivePtr(
-            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
+            getLdStPfType(&I), getLdStPfPointerOperand(&I));
         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
                "Expected consecutive stride.");
         InstWidening Decision =
@@ -7285,6 +7309,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    if (isa<PrefetchInst>(I)) {
+      ElementCount Width = VF;
+      if (Width.isVector()) {
+        InstWidening Decision = getWideningDecision(I, Width);
+        assert(Decision != CM_Unknown &&
+               "CM decision should be taken at this point");
+        if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+          return InstructionCost::getInvalid();
+        if (Decision == CM_Scalarize)
+          Width = ElementCount::getFixed(1);
+      }
+      VectorTy = ToVectorTy(getLdStPfType(I), Width);
+      return getMemoryInstructionCost(I, VF);
+    }
     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
         return *RedCost;
@@ -8134,7 +8172,7 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                 ArrayRef<VPValue *> Operands,
                                                 VFRange &Range,
                                                 VPlanPtr &Plan) {
-  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I) || isa<PrefetchInst>(I)) &&
          "Must be called with either a load or store");
 
   auto willWiden = [&](ElementCount VF) -> bool {
@@ -8169,6 +8207,10 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                               Consecutive, Reverse);
 
+  if (PrefetchInst *Prefetch = dyn_cast<PrefetchInst>(I))
+    return new VPWidenMemoryInstructionRecipe(*Prefetch, Operands[0], Mask,
+                                              Consecutive, Reverse);
+
   StoreInst *Store = cast<StoreInst>(I);
   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                             Mask, Consecutive, Reverse);
@@ -8588,10 +8630,12 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
           [&](ElementCount VF) { return VF.isScalar(); }, Range))
     return nullptr;
 
-  if (auto *CI = dyn_cast<CallInst>(Instr))
+  if (isa<CallInst>(Instr) && !isa<PrefetchInst>(Instr)) {
+    auto *CI = cast<CallInst>(Instr);
     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan));
+  }
 
-  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr) || isa<PrefetchInst>(Instr))
     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
 
   if (!shouldWiden(Instr, Range))
@@ -9400,7 +9444,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   if (IsUniform) {
     // If the recipe is uniform across all parts (instead of just per VF), only
     // generate a single instance.
-    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
+    if ((isa<LoadInst>(UI) || isa<StoreInst>(UI) || isa<PrefetchInst>(UI)) &&
         all_of(operands(), [](VPValue *Op) {
           return Op->isDefinedOutsideVectorRegions();
         })) {
@@ -9430,6 +9474,16 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
     return;
   }
 
+  // A prefetch of a uniform address only needs the last copy of the
+  // prefetch.
+  if (isa<PrefetchInst>(UI) &&
+      vputils::isUniformAfterVectorization(getOperand(0))) {
+    auto Lane = VPLane::getLastLaneForVF(State.VF);
+    State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
+                                    State);
+    return;
+  }
+
   // Generate scalar instances for all VF lanes of all UF parts.
   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
   const unsigned EndLane = State.VF.getKnownMinValue();
@@ -9444,15 +9498,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   // Attempt to issue a wide load.
   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
+  PrefetchInst *PF = dyn_cast<PrefetchInst>(&Ingredient);
 
-  assert((LI || SI) && "Invalid Load/Store instruction");
+  assert((LI || SI || PF) && "Invalid Load/Store/Prefetch instruction");
   assert((!SI || StoredValue) && "No stored value provided for widened store");
   assert((!LI || !StoredValue) && "Stored value provided for widened load");
+  assert((!PF || !StoredValue) && "Stored value provided for widened prefetch");
 
-  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  Type *ScalarDataTy = getLdStPfType(&Ingredient);
 
   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
-  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+  const Align Alignment = getLdStPfAlignment(&Ingredient);
   bool CreateGatherScatter = !isConsecutive();
 
   auto &Builder = State.Builder;
@@ -9535,6 +9591,40 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
     return;
   }
 
+  if (PF) {
+    State.setDebugLocFromInst(PF);
+    Type *ESizeTy = Type::getInt32Ty(PF->getContext());
+    int32_t ESize = ScalarDataTy->getScalarSizeInBits() >> 3;
+    Value *ElemSize = ConstantInt::get(ESizeTy, ESize);
+    Value *RW = PF->getArgOperand(1);
+    Value *Locality = PF->getArgOperand(2);
+
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Instruction *NewPF = nullptr;
+      if (CreateGatherScatter) {
+        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        Value *VectorGep = State.get(getAddr(), Part);
+        NewPF = Builder.CreateMaskedGatherPrefetch(VectorGep, ElemSize,
+                                                   MaskPart, RW, Locality);
+      } else {
+        auto *VecPtr =
+            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        if (isMaskRequired)
+          NewPF = Builder.CreateMaskedPrefetch(VecPtr, ElemSize,
+                                               BlockInMaskParts[Part],
+                                               RW, Locality);
+        else {
+          auto *MaskPart = Constant::getAllOnesValue(
+              VectorType::get(Type::getInt1Ty(DataTy->getContext()), DataTy));
+          NewPF = Builder.CreateMaskedPrefetch(VecPtr, ElemSize,
+                                               MaskPart, RW, Locality);
+        }
+      }
+      State.addMetadata(NewPF, PF);
+    }
+    return;
+  }
+
   // Handle loads.
   assert(LI && "Must have a load instruction");
   State.setDebugLocFrom(LI->getDebugLoc());
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1f5e667cf2e5a04..ae9d70b80c1e397 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1949,7 +1949,8 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
   }
 
   bool isMasked() const {
-    return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
+    return isPrefetch() ? getNumOperands() == 5 :
+              isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
   }
 
 public:
@@ -1971,6 +1972,14 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
     setMask(Mask);
   }
 
+  VPWidenMemoryInstructionRecipe(PrefetchInst &Prefetch, VPValue *Addr, VPValue *Mask,
+                                 bool Consecutive, bool Reverse)
+      : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}),
+        Ingredient(Prefetch), Consecutive(Consecutive), Reverse(Reverse) {
+    assert((Consecutive || !Reverse) && "Reverse implies consecutive");
+    setMask(Mask);
+  }
+
   VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC)
 
   /// Return the address accessed by this recipe.
@@ -1988,6 +1997,9 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
   /// Returns true if this recipe is a store.
   bool isStore() const { return isa<StoreInst>(Ingredient); }
 
+  /// Returns true if this recipe is a prefetch.
+  bool isPrefetch() const { return isa<PrefetchInst>(Ingredient); }
+
   /// Return the address accessed by this recipe.
   VPValue *getStoredValue() const {
     assert(isStore() && "Stored value only available for store instructions");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fbb67fa3308d0fa..ca055b9798cf3ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -146,13 +146,17 @@ bool VPRecipeBase::mayHaveSideEffects() const {
            "underlying instruction has side-effects");
     return false;
   }
-  case VPWidenMemoryInstructionSC:
+  case VPWidenMemoryInstructionSC: {
+    auto *R = cast<VPWidenMemoryInstructionRecipe>(this);
+    if (isa<PrefetchInst>(R->getIngredient()))
+      return true;
     assert(cast<VPWidenMemoryInstructionRecipe>(this)
                    ->getIngredient()
                    .mayHaveSideEffects() == mayWriteToMemory() &&
            "mayHaveSideffects result for ingredient differs from this "
            "implementation");
     return mayWriteToMemory();
+  }
   case VPReplicateSC: {
     auto *R = cast<VPReplicateRecipe>(this);
     return R->getUnderlyingInstr()->mayHaveSideEffects();
@@ -1390,7 +1394,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
                                            VPSlotTracker &SlotTracker) const {
   O << Indent << "WIDEN ";
 
-  if (!isStore()) {
+  if (!isStore() && !isPrefetch()) {
     getVPSingleValue()->printAsOperand(O, SlotTracker);
     O << " = ";
   }
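
As a rough usage sketch of the new IRBuilder helpers (not part of the patch; Ptr, VecOfPtrs, Mask and InsertPt are assumed to be set up by the caller):

    // Emit a masked prefetch over the active lanes. ElemSize, RW and
    // Locality mirror the operands of the scalar llvm.prefetch intrinsic.
    IRBuilder<> Builder(InsertPt);
    Value *ElemSize = Builder.getInt32(8); // element size in bytes
    Value *RW = Builder.getInt32(0);       // 0 = read, 1 = write
    Value *Locality = Builder.getInt32(3); // 3 = maximum temporal locality
    Builder.CreateMaskedPrefetch(Ptr, ElemSize, Mask, RW, Locality);
    // For non-consecutive addresses, pass a vector of pointers instead:
    Builder.CreateMaskedGatherPrefetch(VecOfPtrs, ElemSize, Mask, RW, Locality);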



More information about the llvm-commits mailing list