[Mlir-commits] [mlir] [MLIR][NVVM] Fix undef in cp.async.bulk.tensor.reduce Op (PR #157423)
Durgadoss R
llvmlistbot at llvm.org
Mon Sep 8 06:50:29 PDT 2025
================
@@ -1641,46 +1641,77 @@ CpAsyncBulkTensorSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
is_im2col ? CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, dim, im2col) \
: CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, dim, tile)
-#define GET_CP_ASYNC_BULK_TENSOR_ID(op, dims, is_im2col) \
- [&]() -> auto { \
- switch (dims) { \
- case 1: \
- return CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, 1, tile); \
- case 2: \
- return CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, 2, tile); \
- case 3: \
- return CP_ASYNC_BULK_TENSOR_REDUCE(op, 3, is_im2col); \
- case 4: \
- return CP_ASYNC_BULK_TENSOR_REDUCE(op, 4, is_im2col); \
- case 5: \
- return CP_ASYNC_BULK_TENSOR_REDUCE(op, 5, is_im2col); \
- default: \
- llvm_unreachable("Invalid TensorDim in CpAsyncBulkTensorReduceOp."); \
- } \
- }()
+#define GET_CP_ASYNC_BULK_TENSOR_ID(iid, op, dims, is_im2col) \
+ switch (dims) { \
+ case 1: \
+ iid = CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, 1, tile); \
+ break; \
+ case 2: \
+ iid = CP_ASYNC_BULK_TENSOR_REDUCE_MODE(op, 2, tile); \
+ break; \
+ case 3: \
+ iid = CP_ASYNC_BULK_TENSOR_REDUCE(op, 3, is_im2col); \
+ break; \
+ case 4: \
+ iid = CP_ASYNC_BULK_TENSOR_REDUCE(op, 4, is_im2col); \
+ break; \
+ case 5: \
+ iid = CP_ASYNC_BULK_TENSOR_REDUCE(op, 5, is_im2col); \
+ break; \
+ default: \
+ llvm_unreachable("Invalid TensorDim in CpAsyncBulkTensorReduceOp."); \
+ break; \
+ } \
+ break;
+
+NVVM::IDArgPair CpAsyncBulkTensorReduceOp::getIntrinsicIDAndArgs(
+ Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
+ auto thisOp = cast<NVVM::CpAsyncBulkTensorReduceOp>(op);
+ llvm::LLVMContext &ctx = mt.getLLVMContext();
+
+ llvm::SmallVector<llvm::Value *> args;
+
+ // Arguments to the intrinsic:
+ // shared_mem_ptr, tmaDesc, tensorDims
+ // cache_hint(if applicable) and flag(boolean)
+ args.push_back(mt.lookupValue(thisOp.getSrcMem()));
+ args.push_back(mt.lookupValue(thisOp.getTmaDescriptor()));
+
+ for (auto v : thisOp.getCoordinates())
+ args.push_back(mt.lookupValue(v));
+
+ mlir::Value cacheHint = thisOp.getL2CacheHint();
+ const bool hasCacheHint = static_cast<bool>(cacheHint);
+ llvm::Value *i64Unused =
+ llvm::ConstantInt::get(llvm::Type::getInt64Ty(ctx), 0);
+ args.push_back(hasCacheHint ? mt.lookupValue(cacheHint) : i64Unused);
+ args.push_back(builder.getInt1(hasCacheHint));
+
+ llvm::Intrinsic::ID iid;
+ int tensorDims = thisOp.getCoordinates().size();
+ bool isIm2Col = thisOp.getMode() == NVVM::TMAStoreMode::IM2COL;
-llvm::Intrinsic::ID CpAsyncBulkTensorReduceOp::getIntrinsicID(
- int tensorDims, NVVM::TMAReduxKind kind, bool isIm2Col) {
using RedTy = NVVM::TMAReduxKind;
- switch (kind) {
+ switch (thisOp.getRedKind()) {
case RedTy::ADD:
- return GET_CP_ASYNC_BULK_TENSOR_ID(reduce_add, tensorDims, isIm2Col);
----------------
durga4github wrote:
Why not declare `iid` as a local variable here, assign it the return value in each case, and then `break`?
(Then, at the end, we can return the pair of values.)
That way, the existing macros would not need any change.
https://github.com/llvm/llvm-project/pull/157423
More information about the Mlir-commits
mailing list