[llvm] [NVPTX] Add TMA bulk tensor copy intrinsics (PR #96083)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 18 11:20:33 PDT 2024
================
@@ -4091,3 +4096,246 @@ unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
}
}
}
+
+static size_t GetCpAsyncBulkTensorDimFromIntrinsic(unsigned IID) {
+ switch (IID) {
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_1d:
+ return 1;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_2d:
+ return 2;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_3d:
+ return 3;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_4d:
+ return 4;
+ case Intrinsic::nvvm_cp_async_bulk_tensor_smem_to_gmem_5d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_gmem_to_smem_5d:
+ return 5;
+ default:
+ llvm_unreachable(
+ "Invalid Tensor dim in nvvm_cp_async_bulk_tensor intrinsic");
+ }
+}
+
+#define CP_ASYNC_BULK_TENSOR_OPCODE(dir, dim, mode, suffix) \
+ if (IsShared32) { \
+ return NVPTX:: \
+ CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix; \
+ } else { \
+ return NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix; \
+ }
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode) \
+ do { \
+ if (IsCacheHint) { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(SMEM_TO_GMEM, dim, mode, _CH); \
+ } else { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(SMEM_TO_GMEM, dim, mode, ); \
+ } \
+ } while (0)
+
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode) \
+ do { \
+ if (IsMultiCast && IsCacheHint) { \
+ CP_ASYNC_BULK_TENSOR_OPCODE(GMEM_TO_SMEM, dim, mode, _MC_CH); \
----------------
Artem-B wrote:
TBH, I'm not excited about macros hiding a `return`. It makes it hard to understand what's going on without reading all of the macros themselves. It would be great to refactor it in a way that the macros would produce a value, and the `return` would remain in the function.
One convenient way to do that for the more complex ones is to wrap the logic in an immediately-invoked lambda:
```
#define FOO(a,b,c) \
  [&]() { \
    if (something) \
      return a; \
    if (something else) \
      return b; \
    ... \
    return c; \
  }()
```
Then you could use `return FOO(A,B,C)`
This works much better than the C-style wrapping in `do {} while(0)`
https://github.com/llvm/llvm-project/pull/96083
More information about the llvm-commits
mailing list