[Mlir-commits] [mlir] 108380d - [mlir][nvvm] Add `cp.async.bulk.tensor.shared.cluster.global.multicast` (#72429)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Thu Nov 16 05:35:00 PST 2023
Author: Guray Ozen
Date: 2023-11-16T14:34:56+01:00
New Revision: 108380da357e2db513f016d33adede0d58636bea
URL: https://github.com/llvm/llvm-project/commit/108380da357e2db513f016d33adede0d58636bea
DIFF: https://github.com/llvm/llvm-project/commit/108380da357e2db513f016d33adede0d58636bea.diff
LOG: [mlir][nvvm] Add `cp.async.bulk.tensor.shared.cluster.global.multicast` (#72429)
This PR introduces the `cp.async.bulk.tensor.shared.cluster.global.multicast`
Op in the NVVM dialect. It loads data via TMA from global memory into the
shared memory of multiple CTAs in the cluster.
It resolves #72368
Added:
Modified:
mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index ffe6f25fcd944b6..b85ac91c5f64fcd 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -1405,13 +1405,29 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
Arguments<(ins LLVM_PointerShared:$dstMem,
LLVM_AnyPointer:$tmaDescriptor,
LLVM_PointerShared:$mbar,
+ Optional<I16>:$multicastMask,
Variadic<I32>:$coordinates,
PtxPredicate:$predicate)> {
+ let description = [{
+ Initiates an asynchronous copy operation on the tensor data from global
+ memory to shared memory.
+
+ The `multicastMask` operand is optional. When it is present, the Op copies
+ data from global memory to shared memory of multiple CTAs in the cluster.
+ Operand `multicastMask` specifies the destination CTAs in the cluster such
+ that each bit position in the 16-bit `multicastMask` operand corresponds to
+ the `nvvm.read.ptx.sreg.ctaid` of the destination CTA.
+
+ [For more information, see PTX ISA]
+ (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor)
+ }];
+
let assemblyFormat = [{
$dstMem `,`
$tmaDescriptor `,`
$mbar `,`
- `box` `[`$coordinates `]`
+ ( `multicast_mask` `=` $multicastMask^ `,` )?
+ `box` `[`$coordinates `]`
(`,` `predicate` `=` $predicate^)?
attr-dict `:` type(operands)
}];
@@ -1422,11 +1438,20 @@ def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp :
std::string ptx = "cp.async.bulk.tensor.";
ptx += std::to_string(dim) + "d.";
ptx += "shared::cluster.global.mbarrier::complete_tx::bytes";
- if(dim == 1) ptx += " [%0], [%1, {%3} ], [%2];";
- if(dim == 2) ptx += " [%0], [%1, {%3, %4} ], [%2];";
- if(dim == 3) ptx += " [%0], [%1, {%3, %4, %5} ], [%2];";
- if(dim == 4) ptx += " [%0], [%1, {%3, %4, %5, %6} ], [%2];";
- if(dim == 5) ptx += " [%0], [%1, {%3, %4, %5, %6, %7} ], [%2];";
+ if(getMulticastMask()) {
+ ptx += ".multicast::cluster";
+ if(dim == 1) ptx += " [%0], [%1, {%4} ], [%2], %3;";
+ if(dim == 2) ptx += " [%0], [%1, {%4, %5} ], [%2], %3;";
+ if(dim == 3) ptx += " [%0], [%1, {%4, %5, %6} ], [%2], %3;";
+ if(dim == 4) ptx += " [%0], [%1, {%4, %5, %6, %7} ], [%2], %3;";
+ if(dim == 5) ptx += " [%0], [%1, {%4, %5, %6, %7, %8} ], [%2], %3;";
+ } else {
+ if(dim == 1) ptx += " [%0], [%1, {%3} ], [%2];";
+ if(dim == 2) ptx += " [%0], [%1, {%3, %4} ], [%2];";
+ if(dim == 3) ptx += " [%0], [%1, {%3, %4, %5} ], [%2];";
+ if(dim == 4) ptx += " [%0], [%1, {%3, %4, %5, %6} ], [%2];";
+ if(dim == 5) ptx += " [%0], [%1, {%3, %4, %5, %6, %7} ], [%2];";
+ }
return ptx;
}
}];
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 1977a571130ed12..c0932f17e730fb5 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -975,7 +975,7 @@ struct NVGPUTmaAsyncLoadOpLowering
}
rewriter.replaceOpWithNewOp<NVVM::CpAsyncBulkTensorGlobalToSharedClusterOp>(
- op, dest, adaptor.getTensorMapDescriptor(), barrier, coords,
+ op, dest, adaptor.getTensorMapDescriptor(), barrier, Value(), coords,
adaptor.getPredicate());
return success();
}
diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
index b907a86ebc48072..bd73355321acd78 100644
--- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
+++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir
@@ -130,6 +130,51 @@ func.func @tma_load_5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier
return
}
+// CHECK-LABEL: @tma_load_multicast1d
+func.func @tma_load_multicast1d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %p : i1) {
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4} ], [$2], $3;", "r,l,r,h,r"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$5 cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4} ], [$2], $3;", "r,l,r,h,r,b"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0], predicate=%p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32,i1
+ return
+}
+
+// CHECK-LABEL: @tma_load_multicast2d
+func.func @tma_load_multicast2d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %p : i1) {
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5} ], [$2], $3;", "r,l,r,h,r,r"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$6 cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5} ], [$2], $3;", "r,l,r,h,r,r,b"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1], predicate=%p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i1
+ return
+}
+
+// CHECK-LABEL: @tma_load_multicast3d
+func.func @tma_load_multicast3d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %p : i1) {
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6} ], [$2], $3;", "r,l,r,h,r,r,r"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1,%crd2] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$7 cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6} ], [$2], $3;", "r,l,r,h,r,r,r,b"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1,%crd2], predicate=%p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i1
+ return
+}
+
+// CHECK-LABEL: @tma_load_multicast4d
+func.func @tma_load_multicast4d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %p : i1) {
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7} ], [$2], $3;", "r,l,r,h,r,r,r,r"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1,%crd2,%crd3] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32
+  // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$8 cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7} ], [$2], $3;", "r,l,r,h,r,r,r,r,b"
+  nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1,%crd2,%crd3], predicate=%p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32, i1
+ return
+}
+
+// CHECK-LABEL: @tma_load_multicast5d
+func.func @tma_load_multicast5d(%tmaDescriptor: !llvm.ptr, %dest : !llvm.ptr<3>, %barrier: !llvm.ptr<3>, %multicastMask : i16, %crd0: i32, %crd1: i32, %crd2: i32, %crd3: i32, %crd4: i32, %p : i1) {
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7, $8} ], [$2], $3;", "r,l,r,h,r,r,r,r,r"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1,%crd2,%crd3,%crd4] : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32, i32
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "@$9 cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [$0], [$1, {$4, $5, $6, $7, $8} ], [$2], $3;", "r,l,r,h,r,r,r,r,r,b"
+ nvvm.cp.async.bulk.tensor.shared.cluster.global %dest, %tmaDescriptor, %barrier, multicast_mask = %multicastMask, box [%crd0,%crd1,%crd2,%crd3,%crd4], predicate=%p : !llvm.ptr<3>, !llvm.ptr, !llvm.ptr<3>, i16, i32, i32, i32, i32, i32, i1
+ return
+}
+
// CHECK-LABEL: @tma_store_1d
func.func @tma_store_1d(%tmaDescriptor: !llvm.ptr, %src : !llvm.ptr<3>, %crd0: i32, %p : i1) {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [$0, {$2} ], [$1];", "l,r,r"
More information about the Mlir-commits
mailing list