[llvm] c574c54 - [VE] Split v512.32 load store into interleaved v256.32 ops
Simon Moll via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 7 08:40:00 PST 2022
Author: Simon Moll
Date: 2022-03-07T17:38:38+01:00
New Revision: c574c54ebf1559b5793da7f150006c9dfe7472ee
URL: https://github.com/llvm/llvm-project/commit/c574c54ebf1559b5793da7f150006c9dfe7472ee
DIFF: https://github.com/llvm/llvm-project/commit/c574c54ebf1559b5793da7f150006c9dfe7472ee.diff
LOG: [VE] Split v512.32 load store into interleaved v256.32 ops
Without passthru for now. Support for packed passthru requires
evl-into-mask folding.
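For illustration, the split turns one packed v512.32 memory op into two interleaved v256.32 strided ops: the Hi part accesses the base pointer, the Lo part accesses base + 4, and both use the doubled stride of 8 bytes. A minimal standalone C++ sketch of that addressing (an illustrative model only, not VE backend code):

// Standalone model of the packed split (illustrative only, not VE code).
#include <cassert>
#include <vector>

int main() {
  const unsigned NumElems = 512; // packed v512.32
  const unsigned ElemBytes = 4;  // 32-bit elements
  std::vector<float> Mem(NumElems);
  for (unsigned I = 0; I < NumElems; ++I)
    Mem[I] = float(I);

  // Hi reads byte offsets 0, 8, 16, ...; Lo reads 4, 12, 20, ...
  // Together the two parts touch every packed element exactly once.
  const unsigned PartElems = NumElems / 2;
  std::vector<float> Hi(PartElems), Lo(PartElems);
  for (unsigned I = 0; I < PartElems; ++I) {
    Hi[I] = Mem[(2 * ElemBytes * I) / ElemBytes];
    Lo[I] = Mem[(ElemBytes + 2 * ElemBytes * I) / ElemBytes];
  }

  // Re-packing (the vshf in the emitted code) restores the original order:
  // packed element 2*I comes from Hi, 2*I+1 from Lo.
  for (unsigned I = 0; I < PartElems; ++I) {
    assert(Hi[I] == Mem[2 * I]);
    assert(Lo[I] == Mem[2 * I + 1]);
  }
  return 0;
}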
Reviewed By: kaz7
Differential Revision: https://reviews.llvm.org/D120818
Added:
llvm/test/CodeGen/VE/Packed/vec_load.ll
llvm/test/CodeGen/VE/Packed/vec_store.ll
Modified:
llvm/lib/Target/VE/VECustomDAG.cpp
llvm/lib/Target/VE/VECustomDAG.h
llvm/lib/Target/VE/VEISelLowering.h
llvm/lib/Target/VE/VVPISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
index c513b7a43d0af..26b30f0a5a109 100644
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -155,6 +155,10 @@ Optional<int> getAVLPos(unsigned Opc) {
return 1;
case VEISD::VVP_SELECT:
return 3;
+ case VEISD::VVP_LOAD:
+ return 4;
+ case VEISD::VVP_STORE:
+ return 5;
}
return None;
@@ -431,4 +435,19 @@ VETargetMasks VECustomDAG::getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
return VETargetMasks(NewMask, NewAVL);
}
+SDValue VECustomDAG::getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+ PackElem Part) const {
+ // The Hi part starts at the base pointer even though it holds the more
+ // significant bits of each 64-bit vector element.
+ if (Part == PackElem::Hi)
+ return Ptr;
+ return getNode(ISD::ADD, MVT::i64, {Ptr, ByteStride});
+}
+
+SDValue VECustomDAG::getSplitPtrStride(SDValue PackStride) const {
+ if (auto ConstBytes = dyn_cast<ConstantSDNode>(PackStride))
+ return getConstant(2 * ConstBytes->getSExtValue(), MVT::i64);
+ return getNode(ISD::SHL, MVT::i64, {PackStride, getConstant(1, MVT::i32)});
+}
+
} // namespace llvm
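The two helpers above reduce to simple pointer arithmetic. A plain-C++ sketch of the same math (an assumed model over integer addresses, not the SelectionDAG code):

// Assumed model of the split addressing helpers (not the SelectionDAG code).
#include <cstdint>
#include <cstdio>

enum class PackElem { Lo, Hi };

// Mirrors getSplitPtrOffset: Hi keeps the base pointer, Lo starts one
// packed element further into memory.
uint64_t splitPtrOffset(uint64_t Ptr, uint64_t ByteStride, PackElem Part) {
  return Part == PackElem::Hi ? Ptr : Ptr + ByteStride;
}

// Mirrors getSplitPtrStride: each part skips over the other part's element,
// so its stride is twice the packed stride. The real helper folds the
// constant case and otherwise emits a shift-left-by-one.
uint64_t splitPtrStride(uint64_t PackStride) { return 2 * PackStride; }

int main() {
  const uint64_t Base = 0x1000, Stride = 4; // 32-bit packed elements
  printf("Hi: ptr=0x%llx stride=%llu\n",
         (unsigned long long)splitPtrOffset(Base, Stride, PackElem::Hi),
         (unsigned long long)splitPtrStride(Stride));
  printf("Lo: ptr=0x%llx stride=%llu\n",
         (unsigned long long)splitPtrOffset(Base, Stride, PackElem::Lo),
         (unsigned long long)splitPtrStride(Stride));
  return 0;
}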
diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
index ad3371c5cf042..f2bc8fff2af35 100644
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -188,6 +188,11 @@ class VECustomDAG {
SDValue annotateLegalAVL(SDValue AVL) const;
VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
PackElem Part) const;
+
+ // Splitting support
+ SDValue getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+ PackElem Part) const;
+ SDValue getSplitPtrStride(SDValue PackStride) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index f88eb23821efd..af613955e3f26 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -189,7 +189,9 @@ class VETargetLowering : public TargetLowering {
SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG&) const;
SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const;
SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const;
+ SDValue splitPackedLoadStore(SDValue Op, VECustomDAG &CDAG) const;
SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;
SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const;
/// } VVPLowering
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index e72b7cf54dbb6..cff948f9e67ad 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -114,8 +114,6 @@ SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
auto DataVT = *getIdiomaticVectorType(Op.getNode());
auto Packing = getTypePacking(DataVT);
- assert(Packing == Packing::Normal && "TODO Packed load store isel");
-
// TODO: Infer lower AVL from mask.
if (!AVL)
AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32);
@@ -150,10 +148,117 @@ SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
{Chain, Data, BasePtr, StrideV, Mask, AVL});
}
+SDValue VETargetLowering::splitPackedLoadStore(SDValue Op,
+ VECustomDAG &CDAG) const {
+ auto VVPOC = *getVVPOpcode(Op.getOpcode());
+ assert((VVPOC == VEISD::VVP_LOAD) || (VVPOC == VEISD::VVP_STORE));
+
+ MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+ assert(getTypePacking(DataVT) == Packing::Dense &&
+ "Can only split packed load/store");
+ MVT SplitDataVT = splitVectorType(DataVT);
+
+ SDValue PassThru = getNodePassthru(Op);
+ assert(!PassThru && "Should have been folded in lowering to VVP layer");
+
+ // Analyze the operation
+ SDValue PackedMask = getNodeMask(Op);
+ SDValue PackedAVL = getAnnotatedNodeAVL(Op).first;
+ SDValue PackPtr = getMemoryPtr(Op);
+ SDValue PackData = getStoredValue(Op);
+ SDValue PackStride = getLoadStoreStride(Op, CDAG);
+
+ unsigned ChainResIdx = PackData ? 0 : 1;
+
+ SDValue PartOps[2];
+
+ SDValue UpperPartAVL; // we will use this for packing things back together
+ for (PackElem Part : {PackElem::Hi, PackElem::Lo}) {
+ // VP ops already have an explicit mask and AVL. When expanding from
+ // non-VP, attach those additional inputs here.
+ auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part);
+
+ // Keep track of the upper part's AVL; it is reused to re-pack the result.
+ if (Part == PackElem::Hi)
+ UpperPartAVL = SplitTM.AVL;
+
+ // Attach non-predicating value operands
+ SmallVector<SDValue, 4> OpVec;
+
+ // Chain
+ OpVec.push_back(getNodeChain(Op));
+
+ // Data
+ if (PackData) {
+ SDValue PartData =
+ CDAG.getUnpack(SplitDataVT, PackData, Part, SplitTM.AVL);
+ OpVec.push_back(PartData);
+ }
+
+ // Ptr & stride: push (ptr + ElemBytes * <Part>, 2 * ElemBytes).
+ OpVec.push_back(CDAG.getSplitPtrOffset(PackPtr, PackStride, Part));
+ OpVec.push_back(CDAG.getSplitPtrStride(PackStride));
+
+ // Add predicating args and generate part node
+ OpVec.push_back(SplitTM.Mask);
+ OpVec.push_back(SplitTM.AVL);
+
+ if (PackData) {
+ // Store
+ PartOps[(int)Part] = CDAG.getNode(VVPOC, MVT::Other, OpVec);
+ } else {
+ // Load
+ PartOps[(int)Part] =
+ CDAG.getNode(VVPOC, {SplitDataVT, MVT::Other}, OpVec);
+ }
+ }
+
+ // Merge the chains
+ SDValue LowChain = SDValue(PartOps[(int)PackElem::Lo].getNode(), ChainResIdx);
+ SDValue HiChain = SDValue(PartOps[(int)PackElem::Hi].getNode(), ChainResIdx);
+ SDValue FusedChains =
+ CDAG.getNode(ISD::TokenFactor, MVT::Other, {LowChain, HiChain});
+
+ // Chain only [store]
+ if (PackData)
+ return FusedChains;
+
+ // Re-pack into full packed vector result
+ MVT PackedVT =
+ getLegalVectorType(Packing::Dense, DataVT.getVectorElementType());
+ SDValue PackedVals = CDAG.getPack(PackedVT, PartOps[(int)PackElem::Lo],
+ PartOps[(int)PackElem::Hi], UpperPartAVL);
+
+ return CDAG.getMergeValues({PackedVals, FusedChains});
+}
+
+SDValue VETargetLowering::legalizeInternalLoadStoreOp(SDValue Op,
+ VECustomDAG &CDAG) const {
+ LLVM_DEBUG(dbgs() << "::legalizeInternalLoadStoreOp\n";);
+ MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+
+ // TODO: Recognize packable load/store.
+ if (isPackedVectorType(DataVT))
+ return splitPackedLoadStore(Op, CDAG);
+
+ return legalizePackedAVL(Op, CDAG);
+}
+
SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
SelectionDAG &DAG) const {
+ LLVM_DEBUG(dbgs() << "::legalizeInternalVectorOp\n";);
VECustomDAG CDAG(DAG, Op);
+ // Dispatch to specialized legalization functions.
+ switch (Op->getOpcode()) {
+ case VEISD::VVP_LOAD:
+ case VEISD::VVP_STORE:
+ return legalizeInternalLoadStoreOp(Op, CDAG);
+ }
+
EVT IdiomVT = Op.getValueType();
if (isPackedVectorType(IdiomVT) &&
!supportsPackedMode(Op.getOpcode(), IdiomVT))
@@ -229,7 +334,8 @@ SDValue VETargetLowering::legalizePackedAVL(SDValue Op,
// Halve and round up the EVL for 32-bit element types.
SDValue LegalAVL = AVL;
- if (isPackedVectorType(Op.getValueType())) {
+ MVT IdiomVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+ if (isPackedVectorType(IdiomVT)) {
assert(maySafelyIgnoreMask(Op) &&
"TODO Shift predication from EVL into Mask");
diff --git a/llvm/test/CodeGen/VE/Packed/vec_load.ll b/llvm/test/CodeGen/VE/Packed/vec_load.ll
new file mode 100644
index 0000000000000..59926371300a3
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_load.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %0, i32 immarg %1, <512 x i1> %2, <512 x float> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <512 x float> @vec_mload_v512f32(<512 x float>* %P, <512 x i1> %M) {
+; CHECK-LABEL: vec_mload_v512f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vldu %v0, 8, %s0
+; CHECK-NEXT: lea %s0, 4(, %s0)
+; CHECK-NEXT: vldu %v1, 8, %s0
+; CHECK-NEXT: vshf %v0, %v1, %v0, 8
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> undef)
+ ret <512 x float> %r
+}
+
+; TODO: Packed select legalization
+; Function Attrs: nounwind
+; define fastcc <512 x float> @vec_mload_pt_v512f32(<512 x float>* %P, <512 x float> %PT, <512 x i1> %M) {
+; %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> %PT)
+; ret <512 x float> %r
+; }
+
+declare <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %0, i32 immarg %1, <512 x i1> %2, <512 x i32> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <512 x i32> @vec_mload_v512i32(<512 x i32>* %P, <512 x i1> %M) {
+; CHECK-LABEL: vec_mload_v512i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vldl.zx %v0, 8, %s0
+; CHECK-NEXT: lea %s0, 4(, %s0)
+; CHECK-NEXT: vldl.zx %v1, 8, %s0
+; CHECK-NEXT: vshf %v0, %v1, %v0, 13
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> undef)
+ ret <512 x i32> %r
+}
+
+; TODO: Packed select legalization
+; ; Function Attrs: nounwind
+; define fastcc <512 x i32> @vec_mload_pt_v512i32(<512 x i32>* %P, <512 x i32> %PT, <512 x i1> %M) {
+; %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> %PT)
+; ret <512 x i32> %r
+; }
+
+attributes #0 = { argmemonly nounwind readonly willreturn }
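The commented-out passthru tests above are blocked on the evl-into-mask folding mentioned in the commit message. A scalar sketch of why, assuming the standard masked-load semantics (a lane yields the loaded value only if it is both enabled by the mask and within the EVL, and the passthru value otherwise):

// Scalar model of a VP-style masked load with passthru (assumed semantics,
// for illustration only). Folding the EVL into the mask means a lane loads
// only if Mask[I] && I < EVL; without that folding, the split parts cannot
// select passthru correctly for lanes at or beyond the EVL.
#include <cassert>
#include <vector>

std::vector<float> vpLoadWithPassthru(const std::vector<float> &Mem,
                                      const std::vector<bool> &Mask,
                                      const std::vector<float> &PassThru,
                                      unsigned EVL) {
  std::vector<float> R(PassThru); // masked-off lanes keep the passthru value
  for (unsigned I = 0; I < Mem.size(); ++I)
    if (I < EVL && Mask[I]) // EVL folded into the mask condition
      R[I] = Mem[I];
  return R;
}

int main() {
  std::vector<float> Mem = {1, 2, 3, 4};
  std::vector<bool> Mask = {true, false, true, true};
  std::vector<float> PT = {9, 9, 9, 9};
  auto R = vpLoadWithPassthru(Mem, Mask, PT, /*EVL=*/3);
  assert(R[0] == 1 && R[1] == 9 && R[2] == 3 && R[3] == 9); // lane 3 >= EVL
  return 0;
}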
diff --git a/llvm/test/CodeGen/VE/Packed/vec_store.ll b/llvm/test/CodeGen/VE/Packed/vec_store.ll
new file mode 100644
index 0000000000000..2e8b651d694c9
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_store.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare void @llvm.masked.store.v512f32.p0v512f32(<512 x float>, <512 x float>*, i32 immarg, <512 x i1>)
+
+define fastcc void @vec_mstore_v512f32(<512 x float>* %P, <512 x float> %V, <512 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v512f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vstu %v0, 8, %s0
+; CHECK-NEXT: vshf %v0, %v0, %v0, 4
+; CHECK-NEXT: lea %s0, 4(, %s0)
+; CHECK-NEXT: vstu %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+ call void @llvm.masked.store.v512f32.p0v512f32(<512 x float> %V, <512 x float>* %P, i32 16, <512 x i1> %M)
+ ret void
+}
+
+
+declare void @llvm.masked.store.v512i32.p0v512i32(<512 x i32>, <512 x i32>*, i32 immarg, <512 x i1>)
+
+define fastcc void @vec_mstore_v512i32(<512 x i32>* %P, <512 x i32> %V, <512 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v512i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 4(, %s0)
+; CHECK-NEXT: lea %s2, 256
+; CHECK-NEXT: lvl %s2
+; CHECK-NEXT: vstl %v0, 8, %s1
+; CHECK-NEXT: vshf %v0, %v0, %v0, 0
+; CHECK-NEXT: vstl %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+ call void @llvm.masked.store.v512i32.p0v512i32(<512 x i32> %V, <512 x i32>* %P, i32 16, <512 x i1> %M)
+ ret void
+}