[llvm] c574c54 - [VE] Split v512.32 load store into interleaved v256.32 ops
Simon Moll via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 7 08:40:00 PST 2022
Author: Simon Moll
Date: 2022-03-07T17:38:38+01:00
New Revision: c574c54ebf1559b5793da7f150006c9dfe7472ee
URL: https://github.com/llvm/llvm-project/commit/c574c54ebf1559b5793da7f150006c9dfe7472ee
DIFF: https://github.com/llvm/llvm-project/commit/c574c54ebf1559b5793da7f150006c9dfe7472ee.diff
LOG: [VE] Split v512.32 load store into interleaved v256.32 ops
Without passthru for now. Support for packed passthru requires
evl-into-mask folding.
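For illustration, the split turns one packed v512.32 memory op into two interleaved v256.32 strided ops: the Hi part accesses the base pointer, the Lo part accesses base + 4, and both use the doubled stride of 8 bytes. A minimal standalone C++ sketch of that addressing (an illustrative model only, not VE backend code):

// Standalone model of the packed split (illustrative only, not VE code).
#include <cassert>
#include <vector>

int main() {
  const unsigned NumElems = 512; // packed v512.32
  const unsigned ElemBytes = 4;  // 32-bit elements
  std::vector<float> Mem(NumElems);
  for (unsigned I = 0; I < NumElems; ++I)
    Mem[I] = float(I);

  // Hi reads byte offsets 0, 8, 16, ...; Lo reads 4, 12, 20, ...
  // Together the two parts touch every packed element exactly once.
  const unsigned PartElems = NumElems / 2;
  std::vector<float> Hi(PartElems), Lo(PartElems);
  for (unsigned I = 0; I < PartElems; ++I) {
    Hi[I] = Mem[(2 * ElemBytes * I) / ElemBytes];
    Lo[I] = Mem[(ElemBytes + 2 * ElemBytes * I) / ElemBytes];
  }

  // Re-packing (the vshf in the emitted code) restores the original order:
  // packed element 2*I comes from Hi, 2*I+1 from Lo.
  for (unsigned I = 0; I < PartElems; ++I) {
    assert(Hi[I] == Mem[2 * I]);
    assert(Lo[I] == Mem[2 * I + 1]);
  }
  return 0;
}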
Reviewed By: kaz7
Differential Revision: https://reviews.llvm.org/D120818
Added:
llvm/test/CodeGen/VE/Packed/vec_load.ll
llvm/test/CodeGen/VE/Packed/vec_store.ll
Modified:
llvm/lib/Target/VE/VECustomDAG.cpp
llvm/lib/Target/VE/VECustomDAG.h
llvm/lib/Target/VE/VEISelLowering.h
llvm/lib/Target/VE/VVPISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
index c513b7a43d0af..26b30f0a5a109 100644
--- a/llvm/lib/Target/VE/VECustomDAG.cpp
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -155,6 +155,10 @@ Optional<int> getAVLPos(unsigned Opc) {
return 1;
case VEISD::VVP_SELECT:
return 3;
+ case VEISD::VVP_LOAD:
+ return 4;
+ case VEISD::VVP_STORE:
+ return 5;
}
return None;
@@ -431,4 +435,19 @@ VETargetMasks VECustomDAG::getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
return VETargetMasks(NewMask, NewAVL);
}
+SDValue VECustomDAG::getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+ PackElem Part) const {
+ // The Hi part starts at the base pointer even though it holds the more
+ // significant bits of each 64-bit vector element.
+ if (Part == PackElem::Hi)
+ return Ptr;
+ return getNode(ISD::ADD, MVT::i64, {Ptr, ByteStride});
+}
+
+SDValue VECustomDAG::getSplitPtrStride(SDValue PackStride) const {
+ if (auto ConstBytes = dyn_cast<ConstantSDNode>(PackStride))
+ return getConstant(2 * ConstBytes->getSExtValue(), MVT::i64);
+ return getNode(ISD::SHL, MVT::i64, {PackStride, getConstant(1, MVT::i32)});
+}
+
} // namespace llvm
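The two helpers above reduce to simple pointer arithmetic. A plain-C++ sketch of the same math (an assumed model over integer addresses, not the SelectionDAG code):

// Assumed model of the split addressing helpers (not the SelectionDAG code).
#include <cstdint>
#include <cstdio>

enum class PackElem { Lo, Hi };

// Mirrors getSplitPtrOffset: Hi keeps the base pointer, Lo starts one
// packed element further into memory.
uint64_t splitPtrOffset(uint64_t Ptr, uint64_t ByteStride, PackElem Part) {
  return Part == PackElem::Hi ? Ptr : Ptr + ByteStride;
}

// Mirrors getSplitPtrStride: each part skips over the other part's element,
// so its stride is twice the packed stride. The real helper folds the
// constant case and otherwise emits a shift-left-by-one.
uint64_t splitPtrStride(uint64_t PackStride) { return 2 * PackStride; }

int main() {
  const uint64_t Base = 0x1000, Stride = 4; // 32-bit packed elements
  printf("Hi: ptr=0x%llx stride=%llu\n",
         (unsigned long long)splitPtrOffset(Base, Stride, PackElem::Hi),
         (unsigned long long)splitPtrStride(Stride));
  printf("Lo: ptr=0x%llx stride=%llu\n",
         (unsigned long long)splitPtrOffset(Base, Stride, PackElem::Lo),
         (unsigned long long)splitPtrStride(Stride));
  return 0;
}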
diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
index ad3371c5cf042..f2bc8fff2af35 100644
--- a/llvm/lib/Target/VE/VECustomDAG.h
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -188,6 +188,11 @@ class VECustomDAG {
SDValue annotateLegalAVL(SDValue AVL) const;
VETargetMasks getTargetSplitMask(SDValue RawMask, SDValue RawAVL,
PackElem Part) const;
+
+ // Splitting support
+ SDValue getSplitPtrOffset(SDValue Ptr, SDValue ByteStride,
+ PackElem Part) const;
+ SDValue getSplitPtrStride(SDValue PackStride) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index f88eb23821efd..af613955e3f26 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -189,7 +189,9 @@ class VETargetLowering : public TargetLowering {
SDValue lowerVVP_LOAD_STORE(SDValue Op, VECustomDAG&) const;
SDValue legalizeInternalVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue legalizeInternalLoadStoreOp(SDValue Op, VECustomDAG &CDAG) const;
SDValue splitVectorOp(SDValue Op, VECustomDAG &CDAG) const;
+ SDValue splitPackedLoadStore(SDValue Op, VECustomDAG &CDAG) const;
SDValue legalizePackedAVL(SDValue Op, VECustomDAG &CDAG) const;
SDValue splitMaskArithmetic(SDValue Op, SelectionDAG &DAG) const;
/// } VVPLowering
diff --git a/llvm/lib/Target/VE/VVPISelLowering.cpp b/llvm/lib/Target/VE/VVPISelLowering.cpp
index e72b7cf54dbb6..cff948f9e67ad 100644
--- a/llvm/lib/Target/VE/VVPISelLowering.cpp
+++ b/llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -114,8 +114,6 @@ SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
auto DataVT = *getIdiomaticVectorType(Op.getNode());
auto Packing = getTypePacking(DataVT);
- assert(Packing == Packing::Normal && "TODO Packed load store isel");
-
// TODO: Infer lower AVL from mask.
if (!AVL)
AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32);
@@ -150,10 +148,117 @@ SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
{Chain, Data, BasePtr, StrideV, Mask, AVL});
}
+SDValue VETargetLowering::splitPackedLoadStore(SDValue Op,
+ VECustomDAG &CDAG) const {
+ auto VVPOC = *getVVPOpcode(Op.getOpcode());
+ assert((VVPOC == VEISD::VVP_LOAD) || (VVPOC == VEISD::VVP_STORE));
+
+ MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+ assert(getTypePacking(DataVT) == Packing::Dense &&
+ "Can only split packed load/store");
+ MVT SplitDataVT = splitVectorType(DataVT);
+
+ SDValue PassThru = getNodePassthru(Op);
+ assert(!PassThru && "Should have been folded in lowering to VVP layer");
+
+ // Analyze the operation
+ SDValue PackedMask = getNodeMask(Op);
+ SDValue PackedAVL = getAnnotatedNodeAVL(Op).first;
+ SDValue PackPtr = getMemoryPtr(Op);
+ SDValue PackData = getStoredValue(Op);
+ SDValue PackStride = getLoadStoreStride(Op, CDAG);
+
+ unsigned ChainResIdx = PackData ? 0 : 1;
+
+ SDValue PartOps[2];
+
+ SDValue UpperPartAVL; // we will use this for packing things back together
+ for (PackElem Part : {PackElem::Hi, PackElem::Lo}) {
+ // VP ops already have an explicit mask and AVL. When expanding from
+ // non-VP, attach those additional inputs here.
+ auto SplitTM = CDAG.getTargetSplitMask(PackedMask, PackedAVL, Part);
+
+ // Keep track of the upper part's AVL; it is reused to re-pack the result.
+ if (Part == PackElem::Hi)
+ UpperPartAVL = SplitTM.AVL;
+
+ // Attach non-predicating value operands
+ SmallVector<SDValue, 4> OpVec;
+
+ // Chain
+ OpVec.push_back(getNodeChain(Op));
+
+ // Data
+ if (PackData) {
+ SDValue PartData =
+ CDAG.getUnpack(SplitDataVT, PackData, Part, SplitTM.AVL);
+ OpVec.push_back(PartData);
+ }
+
+ // Ptr & stride: push (ptr + ElemBytes * <Part>, 2 * ElemBytes).
+ OpVec.push_back(CDAG.getSplitPtrOffset(PackPtr, PackStride, Part));
+ OpVec.push_back(CDAG.getSplitPtrStride(PackStride));
+
+ // Add predicating args and generate part node
+ OpVec.push_back(SplitTM.Mask);
+ OpVec.push_back(SplitTM.AVL);
+
+ if (PackData) {
+ // Store
+ PartOps[(int)Part] = CDAG.getNode(VVPOC, MVT::Other, OpVec);
+ } else {
+ // Load
+ PartOps[(int)Part] =
+ CDAG.getNode(VVPOC, {SplitDataVT, MVT::Other}, OpVec);
+ }
+ }
+
+ // Merge the chains
+ SDValue LowChain = SDValue(PartOps[(int)PackElem::Lo].getNode(), ChainResIdx);
+ SDValue HiChain = SDValue(PartOps[(int)PackElem::Hi].getNode(), ChainResIdx);
+ SDValue FusedChains =
+ CDAG.getNode(ISD::TokenFactor, MVT::Other, {LowChain, HiChain});
+
+ // Chain only [store]
+ if (PackData)
+ return FusedChains;
+
+ // Re-pack into full packed vector result
+ MVT PackedVT =
+ getLegalVectorType(Packing::Dense, DataVT.getVectorElementType());
+ SDValue PackedVals = CDAG.getPack(PackedVT, PartOps[(int)PackElem::Lo],
+ PartOps[(int)PackElem::Hi], UpperPartAVL);
+
+ return CDAG.getMergeValues({PackedVals, FusedChains});
+}
+
+SDValue VETargetLowering::legalizeInternalLoadStoreOp(SDValue Op,
+ VECustomDAG &CDAG) const {
+ LLVM_DEBUG(dbgs() << "::legalizeInternalLoadStoreOp\n";);
+ MVT DataVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+
+ // TODO: Recognize packable load/store.
+ if (isPackedVectorType(DataVT))
+ return splitPackedLoadStore(Op, CDAG);
+
+ return legalizePackedAVL(Op, CDAG);
+}
+
SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
SelectionDAG &DAG) const {
+ LLVM_DEBUG(dbgs() << "::legalizeInternalVectorOp\n";);
VECustomDAG CDAG(DAG, Op);
+ // Dispatch to specialized legalization functions.
+ switch (Op->getOpcode()) {
+ case VEISD::VVP_LOAD:
+ case VEISD::VVP_STORE:
+ return legalizeInternalLoadStoreOp(Op, CDAG);
+ }
+
EVT IdiomVT = Op.getValueType();
if (isPackedVectorType(IdiomVT) &&
!supportsPackedMode(Op.getOpcode(), IdiomVT))
@@ -229,7 +334,8 @@ SDValue VETargetLowering::legalizePackedAVL(SDValue Op,
// Halve and round up the EVL for 32-bit element types.
SDValue LegalAVL = AVL;
- if (isPackedVectorType(Op.getValueType())) {
+ MVT IdiomVT = getIdiomaticVectorType(Op.getNode())->getSimpleVT();
+ if (isPackedVectorType(IdiomVT)) {
assert(maySafelyIgnoreMask(Op) &&
"TODO Shift predication from EVL into Mask");
diff --git a/llvm/test/CodeGen/VE/Packed/vec_load.ll b/llvm/test/CodeGen/VE/Packed/vec_load.ll
new file mode 100644
index 0000000000000..59926371300a3
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_load.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %0, i32 immarg %1, <512 x i1> %2, <512 x float> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <512 x float> @vec_mload_v512f32(<512 x float>* %P, <512 x i1> %M) {
+; CHECK-LABEL: vec_mload_v512f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vldu %v0, 8, %s0
+; CHECK-NEXT: lea %s0, 4(, %s0)
+; CHECK-NEXT: vldu %v1, 8, %s0
+; CHECK-NEXT: vshf %v0, %v1, %v0, 8
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> undef)
+ ret <512 x float> %r
+}
+
+; TODO: Packed select legalization
+; Function Attrs: nounwind
+; define fastcc <512 x float> @vec_mload_pt_v512f32(<512 x float>* %P, <512 x float> %PT, <512 x i1> %M) {
+; %r = call <512 x float> @llvm.masked.load.v512f32.p0v512f32(<512 x float>* %P, i32 16, <512 x i1> %M, <512 x float> %PT)
+; ret <512 x float> %r
+; }
+
+declare <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %0, i32 immarg %1, <512 x i1> %2, <512 x i32> %3) #0
+
+; Function Attrs: nounwind
+define fastcc <512 x i32> @vec_mload_v512i32(<512 x i32>* %P, <512 x i1> %M) {
+; CHECK-LABEL: vec_mload_v512i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vldl.zx %v0, 8, %s0
+; CHECK-NEXT: lea %s0, 4(, %s0)
+; CHECK-NEXT: vldl.zx %v1, 8, %s0
+; CHECK-NEXT: vshf %v0, %v1, %v0, 13
+; CHECK-NEXT: b.l.t (, %s10)
+ %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> undef)
+ ret <512 x i32> %r
+}
+
+; TODO: Packed select legalization
+; ; Function Attrs: nounwind
+; define fastcc <512 x i32> @vec_mload_pt_v512i32(<512 x i32>* %P, <512 x i32> %PT, <512 x i1> %M) {
+; %r = call <512 x i32> @llvm.masked.load.v512i32.p0v512i32(<512 x i32>* %P, i32 16, <512 x i1> %M, <512 x i32> %PT)
+; ret <512 x i32> %r
+; }
+
+attributes #0 = { argmemonly nounwind readonly willreturn }
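The commented-out passthru tests above are blocked on the evl-into-mask folding mentioned in the commit message. A scalar sketch of why, assuming the standard masked-load semantics (a lane yields the loaded value only if it is both enabled by the mask and within the EVL, and the passthru value otherwise):

// Scalar model of a VP-style masked load with passthru (assumed semantics,
// for illustration only). Folding the EVL into the mask means a lane loads
// only if Mask[I] && I < EVL; without that folding, the split parts cannot
// select passthru correctly for lanes at or beyond the EVL.
#include <cassert>
#include <vector>

std::vector<float> vpLoadWithPassthru(const std::vector<float> &Mem,
                                      const std::vector<bool> &Mask,
                                      const std::vector<float> &PassThru,
                                      unsigned EVL) {
  std::vector<float> R(PassThru); // masked-off lanes keep the passthru value
  for (unsigned I = 0; I < Mem.size(); ++I)
    if (I < EVL && Mask[I]) // EVL folded into the mask condition
      R[I] = Mem[I];
  return R;
}

int main() {
  std::vector<float> Mem = {1, 2, 3, 4};
  std::vector<bool> Mask = {true, false, true, true};
  std::vector<float> PT = {9, 9, 9, 9};
  auto R = vpLoadWithPassthru(Mem, Mask, PT, /*EVL=*/3);
  assert(R[0] == 1 && R[1] == 9 && R[2] == 3 && R[3] == 9); // lane 3 >= EVL
  return 0;
}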
diff --git a/llvm/test/CodeGen/VE/Packed/vec_store.ll b/llvm/test/CodeGen/VE/Packed/vec_store.ll
new file mode 100644
index 0000000000000..2e8b651d694c9
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Packed/vec_store.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s
+
+declare void @llvm.masked.store.v512f32.p0v512f32(<512 x float>, <512 x float>*, i32 immarg, <512 x i1>)
+
+define fastcc void @vec_mstore_v512f32(<512 x float>* %P, <512 x float> %V, <512 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v512f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 256
+; CHECK-NEXT: lvl %s1
+; CHECK-NEXT: vstu %v0, 8, %s0
+; CHECK-NEXT: vshf %v0, %v0, %v0, 4
+; CHECK-NEXT: lea %s0, 4(, %s0)
+; CHECK-NEXT: vstu %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+ call void @llvm.masked.store.v512f32.p0v512f32(<512 x float> %V, <512 x float>* %P, i32 16, <512 x i1> %M)
+ ret void
+}
+
+
+declare void @llvm.masked.store.v512i32.p0v512i32(<512 x i32>, <512 x i32>*, i32 immarg, <512 x i1>)
+
+define fastcc void @vec_mstore_v512i32(<512 x i32>* %P, <512 x i32> %V, <512 x i1> %M) {
+; CHECK-LABEL: vec_mstore_v512i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lea %s1, 4(, %s0)
+; CHECK-NEXT: lea %s2, 256
+; CHECK-NEXT: lvl %s2
+; CHECK-NEXT: vstl %v0, 8, %s1
+; CHECK-NEXT: vshf %v0, %v0, %v0, 0
+; CHECK-NEXT: vstl %v0, 8, %s0
+; CHECK-NEXT: b.l.t (, %s10)
+ call void @llvm.masked.store.v512i32.p0v512i32(<512 x i32> %V, <512 x i32>* %P, i32 16, <512 x i1> %M)
+ ret void
+}