[llvm] [NVPTX] Add TMA bulk tensor copy intrinsics (PR #96083)
Durgadoss R via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 5 02:32:58 PST 2024
================
@@ -4150,3 +4154,235 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const {
}
bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
+
+#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix) \
+ (IsShared32 \
+ ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
+ : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode) \
+ (IsCacheHint ? (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, _CH)) \
+ : (CP_ASYNC_BULK_TENSOR_OPCODE(S2G, dim, mode, )))
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode) \
+ [&]() -> auto { \
+ if (IsMultiCast && IsCacheHint) \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC_CH); \
+ if (IsCacheHint) \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _CH); \
+ if (IsMultiCast) \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, _MC); \
+ return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, ); \
+ }()
+
+static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
+ bool IsCacheHint, bool IsIm2Col) {
+ if (IsIm2Col) {
+ switch (Dim) {
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL);
+ default:
+ llvm_unreachable("Invalid Dimension in im2col mode for "
+ "GetCpAsyncBulkTensorS2GOpcode.");
+ }
+ } else {
+ switch (Dim) {
+ case 1:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE);
+ case 2:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE);
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE);
+ default:
+ llvm_unreachable(
+ "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+ }
+ }
+}
+
+static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
+ bool IsMultiCast,
+ bool IsCacheHint, bool IsIm2Col) {
+ if (IsIm2Col) {
+ switch (Dim) {
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL);
+ default:
+ llvm_unreachable("Invalid Dimension in im2col mode for "
+ "GetCpAsyncBulkTensorG2SOpcode.");
+ }
+ } else {
+ switch (Dim) {
+ case 1:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE);
+ case 2:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE);
+ case 3:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE);
+ case 4:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE);
+ case 5:
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE);
+ default:
+ llvm_unreachable(
+ "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
+ }
+ }
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SIm2Col(SDNode *N) {
+ // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
+ // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
+ // multicast, cache_hint,
+ // multicast_flag, cache_hint_flag}
+ const std::map<unsigned, size_t> IntrinsicToDims = {
+ {Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d, 3},
+ {Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d, 4},
+ {Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d, 5}};
----------------
durga4github wrote:
I have moved to the {lamda + switch + llvm_unreachable} in the latest revision.
Please let me know if it looks good now.
https://github.com/llvm/llvm-project/pull/96083
More information about the llvm-commits
mailing list