[llvm] [NVPTX] Add TMA Bulk Copy intrinsics (PR #122344)
Durgadoss R via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 10 01:48:08 PST 2025
================
@@ -3024,13 +3024,90 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
+ // Replaces N with a CP_ASYNC_BULK_S2G* machine node. N's operands are
+ // {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+ // dst, src, size, cache_hint, cache_hint_flag
+ // NumOperands = {Chain, IID} + {Actual intrinsic args} = {2} + {5}
+ size_t NumOps = N->getNumOperands();
+ bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; // last operand is cache_hint_flag
+ size_t NumArgs = IsCacheHint ? 4 : 3; // dst, src, size, [cache_hint]
+
+ SDLoc DL(N);
+ SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs)); // skip Chain and IID
+ Ops.push_back(N->getOperand(0)); // Chain operand goes last
+
+ unsigned Opcode;
+ bool IsShared32 =
+ CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+ if (IsCacheHint) {
+ Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
+ : NVPTX::CP_ASYNC_BULK_S2G_CH;
+ } else {
+ Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
+ : NVPTX::CP_ASYNC_BULK_S2G;
+ }
+ ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
+ // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+ // {dst, mbar, src, size, multicast, cache_hint,
+ // multicast_flag, cache_hint_flag}
+ // NumOperands = {Chain, IID} + {Actual intrinsic args}
+ // = {2} + {8}
+ size_t NumOps = N->getNumOperands();
+ bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+ bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
+ size_t NumBaseArgs = 4; // dst, mbar, src, size
+ size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
+
+ SDLoc DL(N);
+ SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
+
+ // Push MultiCast operand, if available
+ if (IsMultiCast)
+ Ops.push_back(N->getOperand(MultiCastIdx));
+
+ // Push CacheHint operand, if available
+ if (IsCacheHint)
+ Ops.push_back(N->getOperand(MultiCastIdx + 1));
+
+ // Finally, the chain operand
+ Ops.push_back(N->getOperand(0));
+
+ unsigned Opcode;
+ bool IsShared32 =
+ CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+ if (IsMultiCast && IsCacheHint) {
+ Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH
+ : NVPTX::CP_ASYNC_BULK_G2S_MC_CH;
+ } else if (IsMultiCast) {
+ Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC
+ : NVPTX::CP_ASYNC_BULK_G2S_MC;
+ } else if (IsCacheHint) {
+ Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH
+ : NVPTX::CP_ASYNC_BULK_G2S_CH;
+ } else {
+ Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32
+ : NVPTX::CP_ASYNC_BULK_G2S;
+ }
----------------
durga4github wrote:
Yes, moved it to a lambda in the latest revision.
Resolving this.
https://github.com/llvm/llvm-project/pull/122344
More information about the llvm-commits
mailing list