[llvm] 9ebaec4 - [VE] (masked) load|store v256.32|64 isel

Simon Moll via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 2 04:32:59 PST 2022


Author: Simon Moll
Date: 2022-03-02T13:31:29+01:00
New Revision: 9ebaec461af41ca86d414d2386aa154e60f02c6d

URL: https://github.com/llvm/llvm-project/commit/9ebaec461af41ca86d414d2386aa154e60f02c6d
DIFF: https://github.com/llvm/llvm-project/commit/9ebaec461af41ca86d414d2386aa154e60f02c6d.diff

LOG: [VE] (masked) load|store v256.32|64 isel

Add `vvp_load|store` nodes. Lower to `vld`, `vst` where possible. Use
`vgt` for masked loads for now.
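
For illustration, taken from the new vec_load.ll test below (lea/lvl
vector-length setup and return elided): a unit-stride v256f64 load
selects to a single `vld`, while the masked form is emulated as a `vgt`
gather from explicitly computed element addresses:

  %r = load <256 x double>, <256 x double>* %P
  ;   vld %v0, 8, %s0

  %r = call <256 x double> @llvm.masked.load.v256f64.p0v256f64(
           <256 x double>* %P, i32 16, <256 x i1> %M, <256 x double> undef)
  ;   vseq    %v0                      (i = 0, ..., 255)
  ;   vmulu.l %v0, 8, %v0, %vm1        (i * 8)
  ;   vaddu.l %v0, %s0, %v0, %vm1      (%P + 8 * i)
  ;   vgt     %v0, %v0, 0, 0, %vm1     (gather under mask %M, held in %vm1)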

Reviewed By: kaz7

Differential Revision: https://reviews.llvm.org/D120413

Added: 
    llvm/test/CodeGen/VE/Vector/vec_load.ll
    llvm/test/CodeGen/VE/Vector/vec_store.ll

Modified: 
    llvm/lib/Target/VE/VECustomDAG.cpp
    llvm/lib/Target/VE/VECustomDAG.h
    llvm/lib/Target/VE/VEISelLowering.cpp
    llvm/lib/Target/VE/VEISelLowering.h
    llvm/lib/Target/VE/VETargetTransformInfo.h
    llvm/lib/Target/VE/VVPISelLowering.cpp
    llvm/lib/Target/VE/VVPInstrInfo.td
    llvm/lib/Target/VE/VVPInstrPatternsVec.td
    llvm/lib/Target/VE/VVPNodes.def

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
index ed463fe624ad0..c513b7a43d0af 100644
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -61,6 +61,10 @@ bool isMaskArithmetic(SDValue Op) {
 /// \returns the VVP_* SDNode opcode corresponding to \p OC.
 Optional<unsigned> getVVPOpcode(unsigned Opcode) {
   switch (Opcode) {
+  case ISD::MLOAD:
+    return VEISD::VVP_LOAD;
+  case ISD::MSTORE:
+    return VEISD::VVP_STORE;
 #define HANDLE_VP_TO_VVP(VPOPC, VVPNAME)                                       \
   case ISD::VPOPC:                                                             \
     return VEISD::VVPNAME;
@@ -166,8 +170,12 @@ Optional<int> getMaskPos(unsigned Opc) {
   if (isVVPBinaryOp(Opc))
     return 2;
 
-  // VM Opcodes.
+  // Other opcodes.
   switch (Opc) {
+  case ISD::MSTORE:
+    return 4;
+  case ISD::MLOAD:
+    return 3;
   case VEISD::VVP_SELECT:
     return 2;
   }
@@ -177,6 +185,116 @@ Optional<int> getMaskPos(unsigned Opc) {
 
 bool isLegalAVL(SDValue AVL) { return AVL->getOpcode() == VEISD::LEGALAVL; }
 
+/// Node Properties {
+
+SDValue getNodeChain(SDValue Op) {
+  if (MemSDNode *MemN = dyn_cast<MemSDNode>(Op.getNode()))
+    return MemN->getChain();
+
+  switch (Op->getOpcode()) {
+  case VEISD::VVP_LOAD:
+  case VEISD::VVP_STORE:
+    return Op->getOperand(0);
+  }
+  return SDValue();
+}
+
+SDValue getMemoryPtr(SDValue Op) {
+  if (auto *MemN = dyn_cast<MemSDNode>(Op.getNode()))
+    return MemN->getBasePtr();
+
+  switch (Op->getOpcode()) {
+  case VEISD::VVP_LOAD:
+    return Op->getOperand(1);
+  case VEISD::VVP_STORE:
+    return Op->getOperand(2);
+  }
+  return SDValue();
+}
+
+Optional<EVT> getIdiomaticVectorType(SDNode *Op) {
+  unsigned OC = Op->getOpcode();
+
+  // For memory ops -> the transferred data type.
+  if (auto MemN = dyn_cast<MemSDNode>(Op))
+    return MemN->getMemoryVT();
+
+  switch (OC) {
+  // Standard ISD.
+  case ISD::SELECT: // not aliased with VVP_SELECT
+  case ISD::CONCAT_VECTORS:
+  case ISD::EXTRACT_SUBVECTOR:
+  case ISD::VECTOR_SHUFFLE:
+  case ISD::BUILD_VECTOR:
+  case ISD::SCALAR_TO_VECTOR:
+    return Op->getValueType(0);
+  }
+
+  // Translate to VVP where possible.
+  if (auto VVPOpc = getVVPOpcode(OC))
+    OC = *VVPOpc;
+
+  switch (OC) {
+  default:
+  case VEISD::VVP_SETCC:
+    return Op->getOperand(0).getValueType();
+
+  case VEISD::VVP_SELECT:
+#define ADD_BINARY_VVP_OP(VVP_NAME, ...) case VEISD::VVP_NAME:
+#include "VVPNodes.def"
+    return Op->getValueType(0);
+
+  case VEISD::VVP_LOAD:
+    return Op->getValueType(0);
+
+  case VEISD::VVP_STORE:
+    return Op->getOperand(1)->getValueType(0);
+
+  // VEC
+  case VEISD::VEC_BROADCAST:
+    return Op->getValueType(0);
+  }
+}
+
+SDValue getLoadStoreStride(SDValue Op, VECustomDAG &CDAG) {
+  if (Op->getOpcode() == VEISD::VVP_STORE)
+    return Op->getOperand(3);
+  if (Op->getOpcode() == VEISD::VVP_LOAD)
+    return Op->getOperand(2);
+
+  if (isa<MemSDNode>(Op.getNode())) {
+    // Regular MLOAD/MSTORE/LOAD/STORE
+    // No stride argument -> use the contiguous element size as stride.
+    uint64_t ElemStride = getIdiomaticVectorType(Op.getNode())
+                              ->getVectorElementType()
+                              .getStoreSize();
+    return CDAG.getConstant(ElemStride, MVT::i64);
+  }
+  return SDValue();
+}
+
+SDValue getStoredValue(SDValue Op) {
+  switch (Op->getOpcode()) {
+  case VEISD::VVP_STORE:
+    return Op->getOperand(1);
+  }
+  if (auto *StoreN = dyn_cast<StoreSDNode>(Op.getNode()))
+    return StoreN->getValue();
+  if (auto *StoreN = dyn_cast<MaskedStoreSDNode>(Op.getNode()))
+    return StoreN->getValue();
+  if (auto *StoreN = dyn_cast<VPStoreSDNode>(Op.getNode()))
+    return StoreN->getValue();
+  return SDValue();
+}
+
+SDValue getNodePassthru(SDValue Op) {
+  if (auto *N = dyn_cast<MaskedLoadSDNode>(Op.getNode()))
+    return N->getPassThru();
+  return SDValue();
+}
+
+/// } Node Properties
+
 SDValue getNodeAVL(SDValue Op) {
   auto PosOpt = getAVLPos(Op->getOpcode());
   return PosOpt ? Op->getOperand(*PosOpt) : SDValue();

diff  --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
index 6553b90a2b69b..ad3371c5cf042 100644
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -88,6 +88,22 @@ std::pair<SDValue, bool> getAnnotatedNodeAVL(SDValue);
 
 /// } AVL Functions
 
+/// Node Properties {
+
+Optional<EVT> getIdiomaticVectorType(SDNode *Op);
+
+SDValue getLoadStoreStride(SDValue Op, VECustomDAG &CDAG);
+
+SDValue getMemoryPtr(SDValue Op);
+
+SDValue getNodeChain(SDValue Op);
+
+SDValue getStoredValue(SDValue Op);
+
+SDValue getNodePassthru(SDValue Op);
+
+/// } Node Properties
+
 enum class Packing {
   Normal = 0, // 256 element standard mode.
   Dense = 1   // 512 element packed mode.
@@ -157,6 +173,10 @@ class VECustomDAG {
   SDValue getPack(EVT DestVT, SDValue LoVec, SDValue HiVec, SDValue AVL) const;
   /// } Packing
 
+  SDValue getMergeValues(ArrayRef<SDValue> Values) const {
+    return DAG.getMergeValues(Values, DL);
+  }
+
   SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false,
                       bool IsOpaque = false) const;
 

diff  --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 1f75dcc6324ce..f1247598ddea6 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -322,6 +322,17 @@ void VETargetLowering::initVPUActions() {
     setOperationAction(ISD::INSERT_VECTOR_ELT, LegalPackedVT, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, LegalPackedVT, Custom);
   }
+
+  // Memory ops on vectors with 32/64-bit elements (the legal lane types).
+  for (MVT VT : MVT::vector_valuetypes()) {
+    MVT ElemVT = VT.getVectorElementType();
+    unsigned ElemBits = ElemVT.getScalarSizeInBits();
+    if (ElemBits != 32 && ElemBits != 64)
+      continue;
+
+    for (unsigned MemOpc : {ISD::MLOAD, ISD::MSTORE, ISD::LOAD, ISD::STORE})
+      setOperationAction(MemOpc, VT, Custom);
+  }
 }
 
 SDValue
@@ -1321,6 +1332,12 @@ static SDValue lowerLoadF128(SDValue Op, SelectionDAG &DAG) {
 SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   LoadSDNode *LdNode = cast<LoadSDNode>(Op.getNode());
 
+  EVT MemVT = LdNode->getMemoryVT();
+
+  // Dispatch to vector isel.
+  if (MemVT.isVector() && !isMaskType(MemVT))
+    return lowerToVVP(Op, DAG);
+
   SDValue BasePtr = LdNode->getBasePtr();
   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
     // Do not expand store instruction with frame index here because of
@@ -1328,7 +1345,6 @@ SDValue VETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return Op;
   }
 
-  EVT MemVT = LdNode->getMemoryVT();
   if (MemVT == MVT::f128)
     return lowerLoadF128(Op, DAG);
 
@@ -1375,6 +1391,11 @@ SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   StoreSDNode *StNode = cast<StoreSDNode>(Op.getNode());
   assert(StNode && StNode->getOffset().isUndef() && "Unexpected node type");
 
+  // Dispatch to vector isel.
+  EVT MemVT = StNode->getMemoryVT();
+  if (MemVT.isVector() && !isMaskType(MemVT))
+    return lowerToVVP(Op, DAG);
+
   SDValue BasePtr = StNode->getBasePtr();
   if (isa<FrameIndexSDNode>(BasePtr.getNode())) {
     // Do not expand store instruction with frame index here because of
@@ -1382,7 +1403,6 @@ SDValue VETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     return Op;
   }
 
-  EVT MemVT = StNode->getMemoryVT();
   if (MemVT == MVT::f128)
     return lowerStoreF128(Op, DAG);
 
@@ -1699,12 +1719,9 @@ VETargetLowering::getCustomOperationAction(SDNode &Op) const {
 SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "::LowerOperation"; Op->print(dbgs()););
   unsigned Opcode = Op.getOpcode();
-  if (ISD::isVPOpcode(Opcode))
-    return lowerToVVP(Op, DAG);
 
+  /// Scalar isel.
   switch (Opcode) {
-  default:
-    llvm_unreachable("Should not custom lower this!");
   case ISD::ATOMIC_FENCE:
     return lowerATOMIC_FENCE(Op, DAG);
   case ISD::ATOMIC_SWAP:
@@ -1748,6 +1765,16 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  }
+
+  /// Vector isel.
+  LLVM_DEBUG(dbgs() << "::LowerOperation_VVP"; Op->print(dbgs()););
+  if (ISD::isVPOpcode(Opcode))
+    return lowerToVVP(Op, DAG);
+
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Should not custom lower this!");
 
   // Legalize the AVL of this internal node.
   case VEISD::VEC_BROADCAST:
@@ -1759,6 +1786,8 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return legalizeInternalVectorOp(Op, DAG);
 
     // Translate into a VEC_*/VVP_* layer operation.
+  case ISD::MLOAD:
+  case ISD::MSTORE:
 #define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
 #include "VVPNodes.def"
     if (isMaskArithmetic(Op) && isPackedVectorType(Op.getValueType()))

diff  --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index cc7a156d5b937..f88eb23821efd 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -186,6 +186,8 @@ class VETargetLowering : public TargetLowering {
 
   /// VVP Lowering {
   SDValue lowerToVVP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG&) const;
+
   SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const;
   SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;

diff  --git a/llvm/lib/Target/VE/VETargetTransformInfo.h b/llvm/lib/Target/VE/VETargetTransformInfo.h
index 0242fa1b01179..7cca3d496f6e1 100644
--- a/llvm/lib/Target/VE/VETargetTransformInfo.h
+++ b/llvm/lib/Target/VE/VETargetTransformInfo.h
@@ -21,6 +21,32 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/BasicTTIImpl.h"
 
+static llvm::Type *getVectorElementType(llvm::Type *Ty) {
+  return llvm::cast<llvm::FixedVectorType>(Ty)->getElementType();
+}
+
+static llvm::Type *getLaneType(llvm::Type *Ty) {
+  using namespace llvm;
+  if (!isa<VectorType>(Ty))
+    return Ty;
+  return getVectorElementType(Ty);
+}
+
+static bool isVectorLaneType(llvm::Type &ElemTy) {
+  // check element sizes for vregs
+  if (ElemTy.isIntegerTy()) {
+    unsigned ScaBits = ElemTy.getScalarSizeInBits();
+    return ScaBits == 1 || ScaBits == 32 || ScaBits == 64;
+  }
+  if (ElemTy.isPointerTy()) {
+    return true;
+  }
+  if (ElemTy.isFloatTy() || ElemTy.isDoubleTy()) {
+    return true;
+  }
+  return false;
+}
+
 namespace llvm {
 
 class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
@@ -86,6 +112,21 @@ class VETTIImpl : public BasicTTIImplBase<VETTIImpl> {
     //   output
     return false;
   }
+
+  // Load & Store {
+  bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) {
+    return isVectorLaneType(*getLaneType(DataType));
+  }
+  bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+    return isVectorLaneType(*getLaneType(DataType));
+  }
+  bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
+    return isVectorLaneType(*getLaneType(DataType));
+  }
+  bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+    return isVectorLaneType(*getLaneType(DataType));
+  }
+  // } Load & Store
 };
 
 } // namespace llvm

diff  --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index e3fba730e5ad4..e72b7cf54dbb6 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -46,6 +46,13 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
 
   // The representative and legalized vector type of this operation.
   VECustomDAG CDAG(DAG, Op);
+  // Dispatch to complex lowering functions.
+  switch (VVPOpcode) {
+  case VEISD::VVP_LOAD:
+  case VEISD::VVP_STORE:
+    return lowerVVP_LOAD_STORE(Op, CDAG);
+  }
+
   EVT OpVecVT = Op.getValueType();
   EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
   auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
@@ -89,6 +96,60 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
   llvm_unreachable("lowerToVVP called for unexpected SDNode.");
 }
 
+SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
+                                              VECustomDAG &CDAG) const {
+  auto VVPOpc = *getVVPOpcode(Op->getOpcode());
+  const bool IsLoad = (VVPOpc == VEISD::VVP_LOAD);
+
+  // Shared operands.
+  SDValue BasePtr = getMemoryPtr(Op);
+  SDValue Mask = getNodeMask(Op);
+  SDValue Chain = getNodeChain(Op);
+  SDValue AVL = getNodeAVL(Op);
+  // Store specific.
+  SDValue Data = getStoredValue(Op);
+  // Load specific.
+  SDValue PassThru = getNodePassthru(Op);
+
+  auto DataVT = *getIdiomaticVectorType(Op.getNode());
+  auto Packing = getTypePacking(DataVT);
+
+  assert(Packing == Packing::Normal && "TODO: packed load/store isel");
+
+  // TODO: Infer lower AVL from mask.
+  if (!AVL)
+    AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32);
+
+  // Default to the all-true mask.
+  if (!Mask)
+    Mask = CDAG.getConstantMask(Packing, true);
+
+  SDValue StrideV = getLoadStoreStride(Op, CDAG);
+  if (IsLoad) {
+    MVT LegalDataVT = getLegalVectorType(
+        Packing, DataVT.getVectorElementType().getSimpleVT());
+
+    auto NewLoadV = CDAG.getNode(VEISD::VVP_LOAD, {LegalDataVT, MVT::Other},
+                                 {Chain, BasePtr, StrideV, Mask, AVL});
+
+    if (!PassThru || PassThru->isUndef())
+      return NewLoadV;
+
+    // Convert passthru to an explicit select node.
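+  // VVP_SELECT takes the loaded value in lanes where the mask is set and
+  // falls back to the PassThru value elsewhere; this selects to `vmrg`
+  // (see the vec_mload_pt_* tests below).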
+    SDValue DataV = CDAG.getNode(VEISD::VVP_SELECT, DataVT,
+                                 {NewLoadV, PassThru, Mask, AVL});
+    SDValue NewLoadChainV = SDValue(NewLoadV.getNode(), 1);
+
+    // Merge them back into one node.
+    return CDAG.getMergeValues({DataV, NewLoadChainV});
+  }
+
+  // VVP_STORE
+  assert(VVPOpc == VEISD::VVP_STORE);
+  return CDAG.getNode(VEISD::VVP_STORE, Op.getNode()->getVTList(),
+                      {Chain, Data, BasePtr, StrideV, Mask, AVL});
+}
+
 SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
                                                    SelectionDAG &DAG) const {
   VECustomDAG CDAG(DAG, Op);

diff  --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
index a607788b884df..b1ac71ced8238 100644
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -18,6 +18,24 @@
 // TODO explain how VVP nodes relate to VP SDNodes once VP ISel is upstream.
 //===----------------------------------------------------------------------===//
 
+// vvp_load(ptr, stride, mask, avl)
+def SDTLoadVVP : SDTypeProfile<1, 4, [
+  SDTCisVec<0>,
+  SDTCisPtrTy<1>,
+  SDTCisInt<2>,
+  SDTCisVec<3>,
+  IsVLVT<4>
+]>;
+
+// vvp_store(data, ptr, stride, mask, avl)
+def SDTStoreVVP: SDTypeProfile<0, 5, [
+  SDTCisVec<0>,
+  SDTCisPtrTy<1>,
+  SDTCisInt<2>,
+  SDTCisVec<3>,
+  IsVLVT<4>
+]>;
+
 // Binary Operators {
 
 // BinaryOp(x,y,mask,vl)
@@ -102,6 +120,12 @@ def vvp_fdiv    : SDNode<"VEISD::VVP_FDIV",  SDTFPBinOpVVP>;
 
 // } Binary Operators
 
+def vvp_load    : SDNode<"VEISD::VVP_LOAD",  SDTLoadVVP,
+                         [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def vvp_store   : SDNode<"VEISD::VVP_STORE", SDTStoreVVP,
+                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
 def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;
 
 // setcc (lhs, rhs, cc, mask, vl)

diff  --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
index 22de6ddf9d6be..ab544cfb78819 100644
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -17,6 +17,85 @@
 //===----------------------------------------------------------------------===//
 include "VVPInstrInfo.td"
 
+multiclass VectorStore<ValueType DataVT,
+    ValueType PtrVT, ValueType MaskVT,
+    string STWithMask, string STNoMask> {
+  // Unmasked (imm stride).
+  def : Pat<(vvp_store
+               DataVT:$val, PtrVT:$addr,
+               (i64 simm7:$stride), (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(STNoMask#"irvl")
+               (LO7 $stride), $addr, $val, $avl)>;
+  // Unmasked.
+  def : Pat<(vvp_store
+               DataVT:$val, PtrVT:$addr,
+               i64:$stride, (MaskVT true_mask), i32:$avl),
+            (!cast<Instruction>(STNoMask#"rrvl")
+               $stride, $addr, $val, $avl)>;
+  // Masked (imm stride).
+  def : Pat<(vvp_store
+               DataVT:$val, PtrVT:$addr,
+               (i64 simm7:$stride), MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(STWithMask#"irvml")
+               (LO7 $stride), $addr, $val, $mask, $avl)>;
+  // Masked.
+  def : Pat<(vvp_store
+               DataVT:$val, PtrVT:$addr,
+               i64:$stride, MaskVT:$mask, i32:$avl),
+            (!cast<Instruction>(STWithMask#"rrvml")
+               $stride, $addr, $val, $mask, $avl)>;
+}
+
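+// On VE, f32 lanes live in the upper 32 bits of each 64-bit vector element
+// (hence VSTU for f32 stores); i32 lanes live in the lower 32 bits (VSTL).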
+defm : VectorStore<v256f64, i64, v256i1, "VST",  "VST">;
+defm : VectorStore<v256i64, i64, v256i1, "VST",  "VST">;
+defm : VectorStore<v256f32, i64, v256i1, "VSTU", "VSTU">;
+defm : VectorStore<v256i32, i64, v256i1, "VSTL", "VSTL">;
+
+multiclass VectorLoad<ValueType DataVT,
+    ValueType PtrVT, ValueType MaskVT,
+    string GTWithMask, string LDNoMask> {
+  // Unmasked (imm stride).
+  def : Pat<(DataVT (vvp_load
+               PtrVT:$addr, (i64 simm7:$stride),
+               (MaskVT true_mask), i32:$avl)),
+            (!cast<Instruction>(LDNoMask#"irl")
+               (LO7 $stride), $addr, $avl)>;
+  // Unmasked.
+  def : Pat<(DataVT (vvp_load
+               PtrVT:$addr, i64:$stride,
+               (MaskVT true_mask), i32:$avl)),
+            (!cast<Instruction>(LDNoMask#"rrl")
+               $stride, $addr, $avl)>;
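+  // There is no masked variant of vld, so a masked load is emulated as a
+  // vgt gather whose addresses are computed as  addr + stride * vseq(avl)
+  // under the mask.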
+  // Masked (imm stride).
+  def : Pat<(DataVT (vvp_load
+               PtrVT:$addr, (i64 simm7:$stride),
+               MaskVT:$mask, i32:$avl)),
+            (!cast<Instruction>(GTWithMask#"vizml")
+               (VADDULrvml $addr,
+                  (VMULULivml (LO7 $stride), (VSEQl $avl), $mask, $avl),
+                  $mask, $avl),
+               0, 0,
+               $mask,
+               $avl)>;
+  // Masked.
+  def : Pat<(DataVT (vvp_load
+               PtrVT:$addr, i64:$stride, MaskVT:$mask, i32:$avl)),
+            (!cast<Instruction>(GTWithMask#"vizml")
+               (VADDULrvml $addr,
+                  (VMULULrvml $stride, (VSEQl $avl), $mask, $avl),
+                  $mask, $avl),
+               0, 0,
+               $mask,
+               $avl)>;
+}
+
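+// VLDU/VGTU transfer the upper 32 bits of each element (f32 lanes);
+// VLDLZX/VGTLZX transfer the lower 32 bits and zero-extend (i32 lanes).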
+defm : VectorLoad<v256f64, i64, v256i1, "VGT",    "VLD">;
+defm : VectorLoad<v256i64, i64, v256i1, "VGT",    "VLD">;
+defm : VectorLoad<v256f32, i64, v256i1, "VGTU",   "VLDU">;
+defm : VectorLoad<v256i32, i64, v256i1, "VGTLZX", "VLDLZX">;
+
 multiclass Binary_rv<SDPatternOperator OpNode,
     ValueType ScalarVT, ValueType DataVT,
     ValueType MaskVT, string OpBaseName> {

diff  --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index 1ba602f4f2d36..cbb5f4d5098ed 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -44,6 +44,9 @@
 #define REGISTER_PACKED(OPC)
 #endif
 
+// Memory operations.
+ADD_VVP_OP(VVP_LOAD, LOAD)   HANDLE_VP_TO_VVP(VP_LOAD, VVP_LOAD)   REGISTER_PACKED(VVP_LOAD)
+ADD_VVP_OP(VVP_STORE, STORE) HANDLE_VP_TO_VVP(VP_STORE, VVP_STORE) REGISTER_PACKED(VVP_STORE)
+
 // Integer arithmetic.
 ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD)
 ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB)

diff  --git a/llvm/test/CodeGen/VE/Vector/vec_load.ll b/llvm/test/CodeGen/VE/Vector/vec_load.ll
new file mode 100644
index 0000000000000..69af962b33b40
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_load.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>* %0, i32 immarg %1, <128 x i1> %2, <128 x double> %3) #0
+
+; TODO: Custom widen by lowering to vvp_load in ReplaceNodeResult
+; Function Attrs: nounwind
+; define fastcc <128 x double> @vec_mload_v128f64(<128 x double>* %P, <128 x i1> %M) {
+;   %r = call <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>* %P, i32 16, <128 x i1> %M, <128 x double> undef)
+;   ret <128 x double> %r
+; }
+
+
+declare <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>* %0, i32 immarg %1, <256 x i1> %2, <256 x double> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <256 x double> @vec_mload_v256f64(<256 x double>* %P, <256 x i1> %M) {
+; CHECK-LABEL: vec_mload_v256f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, 8, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgt %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>* %P, i32 16, <256 x i1> %M, <256 x double> undef)
+  ret <256 x double> %r
+}
+
+; Function Attrs: nounwind
+define fastcc <256 x double> @vec_load_v256f64(<256 x double>* %P) {
+; CHECK-LABEL: vec_load_v256f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vld %v0, 8, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = load <256 x double>, <256 x double>* %P, align 4
+  ret <256 x double> %r
+}
+
+; Function Attrs: nounwind
+define fastcc <256 x double> @vec_mload_pt_v256f64(<256 x double>* %P, <256 x double> %PT, <256 x i1> %M) {
+; CHECK-LABEL: vec_mload_pt_v256f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vseq %v1
+; CHECK-NEXT:    vmulu.l %v1, 8, %v1, %vm1
+; CHECK-NEXT:    vaddu.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vgt %v1, %v1, 0, 0, %vm1
+; CHECK-NEXT:    vmrg %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>* %P, i32 16, <256 x i1> %M, <256 x double> %PT)
+  ret <256 x double> %r
+}
+
+
+declare <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* %0, i32 immarg %1, <256 x i1> %2, <256 x float> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <256 x float> @vec_mload_v256f32(<256 x float>* %P, <256 x i1> %M) {
+; CHECK-LABEL: vec_mload_v256f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, 4, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgtu %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* %P, i32 16, <256 x i1> %M, <256 x float> undef)
+  ret <256 x float> %r
+}
+
+; Function Attrs: nounwind
+define fastcc <256 x float> @vec_mload_pt_v256f32(<256 x float>* %P, <256 x float> %PT, <256 x i1> %M) {
+; CHECK-LABEL: vec_mload_pt_v256f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vseq %v1
+; CHECK-NEXT:    vmulu.l %v1, 4, %v1, %vm1
+; CHECK-NEXT:    vaddu.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vgtu %v1, %v1, 0, 0, %vm1
+; CHECK-NEXT:    vmrg %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* %P, i32 16, <256 x i1> %M, <256 x float> %PT)
+  ret <256 x float> %r
+}
+
+
+declare <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* %0, i32 immarg %1, <256 x i1> %2, <256 x i32> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <256 x i32> @vec_mload_v256i32(<256 x i32>* %P, <256 x i1> %M) {
+; CHECK-LABEL: vec_mload_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vseq %v0
+; CHECK-NEXT:    vmulu.l %v0, 4, %v0, %vm1
+; CHECK-NEXT:    vaddu.l %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    vgtl.zx %v0, %v0, 0, 0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* %P, i32 16, <256 x i1> %M, <256 x i32> undef)
+  ret <256 x i32> %r
+}
+
+; Function Attrs: nounwind
+define fastcc <256 x i32> @vec_mload_pt_v256i32(<256 x i32>* %P, <256 x i32> %PT, <256 x i1> %M) {
+; CHECK-LABEL: vec_mload_pt_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vseq %v1
+; CHECK-NEXT:    vmulu.l %v1, 4, %v1, %vm1
+; CHECK-NEXT:    vaddu.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT:    vgtl.zx %v1, %v1, 0, 0, %vm1
+; CHECK-NEXT:    vmrg %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* %P, i32 16, <256 x i1> %M, <256 x i32> %PT)
+  ret <256 x i32> %r
+}
+
+attributes #0 = { argmemonly nounwind readonly willreturn }

diff  --git a/llvm/test/CodeGen/VE/Vector/vec_store.ll b/llvm/test/CodeGen/VE/Vector/vec_store.ll
new file mode 100644
index 0000000000000..a80d1a12d21be
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_store.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare void @llvm.masked.store.v256f64.p0v256f64(<256 x double>, <256 x double>*, i32 immarg, <256 x i1>)
+
+define fastcc void @vec_mstore_v256f64(<256 x double>* %P, <256 x double> %V, <256 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v256f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vst %v0, 8, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.masked.store.v256f64.p0v256f64(<256 x double> %V, <256 x double>* %P, i32 16, <256 x i1> %M)
+  ret void
+}
+
+
+declare void @llvm.masked.store.v256f32.p0v256f32(<256 x float>, <256 x float>*, i32 immarg, <256 x i1>)
+
+define fastcc void @vec_mstore_v256f32(<256 x float>* %P, <256 x float> %V, <256 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v256f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vstu %v0, 4, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.masked.store.v256f32.p0v256f32(<256 x float> %V, <256 x float>* %P, i32 16, <256 x i1> %M)
+  ret void
+}
+
+
+declare void @llvm.masked.store.v256i32.p0v256i32(<256 x i32>, <256 x i32>*, i32 immarg, <256 x i1>)
+
+define fastcc void @vec_mstore_v256i32(<256 x i32>* %P, <256 x i32> %V, <256 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v256i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lea %s1, 256
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vstl %v0, 4, %s0
+; CHECK-NEXT:    b.l.t (, %s10)
+  call void @llvm.masked.store.v256i32.p0v256i32(<256 x i32> %V, <256 x i32>* %P, i32 16, <256 x i1> %M)
+  ret void
+}


        

