[Mlir-commits] [mlir] 4d33c69 - [MLIR][GPU] Add cooperative launch support to gpu.launch_func (#190639)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed Apr 29 02:33:27 PDT 2026
Author: Jared Hoberock
Date: 2026-04-29T11:33:22+02:00
New Revision: 4d33c692e97dec54556e7d008cd5ec49ce0afaeb
URL: https://github.com/llvm/llvm-project/commit/4d33c692e97dec54556e7d008cd5ec49ce0afaeb
DIFF: https://github.com/llvm/llvm-project/commit/4d33c692e97dec54556e7d008cd5ec49ce0afaeb.diff
LOG: [MLIR][GPU] Add cooperative launch support to gpu.launch_func (#190639)
Add a `cooperative` UnitAttr to `gpu.launch_func` that enables
cooperative kernel launch semantics. Cooperative launches guarantee that
all thread blocks in the grid are co-resident on the GPU simultaneously,
enabling grid-wide synchronization patterns.
## Implementation
When `cooperative` is set (with or without cluster sizes), the lowering
emits a call to the new `mgpuLaunchKernelCooperative` runtime function,
which uses `cuLaunchKernelEx` with a `CUlaunchConfig` and
`CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. This API is guarded behind
`CUDA_VERSION >= 12000`. The HIP path funnels through
`hipModuleLaunchCooperativeKernel`.
## Changes
- **GPUOps.td**: add `cooperative` UnitAttr and assembly format keyword
- **SelectObjectAttr.cpp**: add `getKernelLaunchCooperativeFn()`, route
cooperative launches (with or without cluster sizes) through
`mgpuLaunchKernelCooperative`; cluster-only launches keep using
`mgpuLaunchClusterKernel`
- **CudaRuntimeWrappers.cpp**: implement `mgpuLaunchKernelCooperative`
via `cuLaunchKernelEx`
- **RocmRuntimeWrappers.cpp**: implement `mgpuLaunchKernelCooperative`
via `hipModuleLaunchCooperativeKernel`
- **GPUToLLVMConversion.cpp**: propagate cooperative attribute through
the legalization pattern
- **test/Dialect/GPU/ops.mlir**: round-trip tests for cooperative
keyword with and without clusters
## Context
MLIR currently has no support for cooperative kernel launches. Flang
works around this with a CUF-specific attribute (PRs #124325, #124362),
but there is no first-class support in the GPU dialect. This patch adds
it at the `gpu.launch_func` level so all frontends can use it.
Assisted-by: Claude (Anthropic)
Added:
Modified:
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
mlir/test/Dialect/GPU/ops.mlir
mlir/test/Dialect/GPU/outlining.mlir
mlir/test/Target/LLVMIR/gpu.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 16c3ffd2c8587..a5525580fb320 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -619,6 +619,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
Optional<LaunchIndx>:$clusterSizeY,
Optional<LaunchIndx>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize,
+ UnitAttr:$cooperative,
Variadic<AnyType>:$kernelOperands,
Optional<AnyType>:$asyncObject)>,
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
@@ -661,6 +662,11 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
arguments are present, the Op launches a kernel that clusters the given
thread blocks. This feature is exclusive to certain architectures.
+ The `cooperative` attribute indicates that the kernel should be launched
+ cooperatively, guaranteeing that all thread blocks in the grid are
+ co-resident on the GPU simultaneously. This enables grid-wide
+ synchronization patterns.
+
Example:
```mlir
@@ -787,6 +793,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
`threads` `in` ` ` `(` $blockSizeX `,` $blockSizeY `,` $blockSizeZ `)`
custom<LaunchDimType>(type($gridSizeX), ref($clusterSizeX), type($clusterSizeX), type($clusterSizeY), type($clusterSizeZ))
(`dynamic_shared_memory_size` $dynamicSharedMemorySize^)?
+ (`cooperative` $cooperative^)?
custom<LaunchFuncOperands>($kernelOperands, type($kernelOperands)) attr-dict
}];
let hasVerifier = 1;
@@ -803,6 +810,7 @@ def GPU_LaunchOp : GPU_Op<"launch", [
Optional<Index>:$clusterSizeY,
Optional<Index>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize,
+ UnitAttr:$cooperative,
OptionalAttr<FlatSymbolRefAttr>:$module,
OptionalAttr<FlatSymbolRefAttr>:$function,
OptionalAttr<ConfinedAttr<I64Attr, [IntNonNegative]>>:$workgroup_attributions)>,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 2b321ee846d36..21301110cbd42 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1070,7 +1070,7 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(),
adaptor.getClusterSizeZ()};
}
- gpu::LaunchFuncOp::create(
+ auto newLaunchOp = gpu::LaunchFuncOp::create(
rewriter, launchOp.getLoc(), launchOp.getKernelAttr(),
gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
adaptor.getGridSizeZ()},
@@ -1079,6 +1079,8 @@ LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
adaptor.getDynamicSharedMemorySize(),
llvmArgumentsWithSizes.empty() ? llvmArguments : llvmArgumentsWithSizes,
stream, clusterSize);
+ if (launchOp.getCooperative())
+ newLaunchOp.setCooperative(true);
if (launchOp.getAsyncToken())
rewriter.replaceOp(launchOp, {stream});
else
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index aa68f02f46b99..d3fb6df2010d2 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -924,6 +924,9 @@ void LaunchOp::print(OpAsmPrinter &p) {
p << ')';
}
+ if (getCooperative())
+ p << " cooperative";
+
printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributionBBArgs());
printAttributions(p, getPrivateKeyword(), getPrivateAttributions());
@@ -933,7 +936,8 @@ void LaunchOp::print(OpAsmPrinter &p) {
p.printOptionalAttrDict((*this)->getAttrs(), /*elidedAttrs=*/{
LaunchOp::getOperandSegmentSizeAttr(),
getWorkgroupAttributionsAttrName(),
- moduleAttrName, functionAttrName});
+ getCooperativeAttrName(), moduleAttrName,
+ functionAttrName});
}
// Parse the size assignment blocks for blocks and threads. These have the form
@@ -1075,6 +1079,10 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) {
return failure();
}
+ // Parse optional cooperative keyword.
+ if (succeeded(parser.parseOptionalKeyword("cooperative")))
+ result.addAttribute("cooperative", parser.getBuilder().getUnitAttr());
+
// Create the region arguments: fixed launch-config args (`index`), then
// workgroup / private attribution args. The workgroup count is stored in the
// inherent `workgroup_attributions` attribute when non-zero.
diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
index d2e74dce23078..1c05dd4416aba 100644
--- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
@@ -296,6 +296,8 @@ static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
launchOp.getDynamicSharedMemorySize(), operands,
asyncToken ? asyncToken.getType() : nullptr,
launchOp.getAsyncDependencies(), clusterSize);
+ if (launchOp.getCooperative())
+ launchFunc.setCooperative(true);
launchOp.replaceAllUsesWith(launchFunc);
launchOp.erase();
}
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index f60db7b760aa6..2a7641f2df445 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -362,6 +362,73 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
#if (CUDA_VERSION >= 12000)
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchKernelCooperative(
+ CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
+ intptr_t clusterX, intptr_t clusterY, intptr_t clusterZ, intptr_t blockX,
+ intptr_t blockY, intptr_t blockZ, int32_t smem, CUstream stream,
+ void **params, void **extra) {
+ ScopedContext scopedContext;
+ if (smem > 0) {
+ int32_t maxShmem = 0;
+ CUdevice device = getDefaultCuDevice();
+ CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
+ CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute(
+ &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+ device));
+ if (maxShmem < smem) {
+ fprintf(stderr,
+ "Requested shared memory (%dkb) is larger than maximum allowed "
+ "shared memory (%dkb) for this device\n",
+ smem, maxShmem);
+ }
+ CUDA_REPORT_IF_ERROR(cuFuncSetAttribute(
+ function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem));
+ }
+
+ CUlaunchConfig config;
+ config.gridDimX = gridX;
+ config.gridDimY = gridY;
+ config.gridDimZ = gridZ;
+ config.blockDimX = blockX;
+ config.blockDimY = blockY;
+ config.blockDimZ = blockZ;
+ config.sharedMemBytes = smem;
+ config.hStream = stream;
+
+ CUlaunchAttribute launchAttrs[3];
+ int numAttrs = 0;
+
+ bool hasCluster = clusterX > 0 && clusterY > 0 && clusterZ > 0;
+ if (hasCluster) {
+ launchAttrs[numAttrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+ launchAttrs[numAttrs].value.clusterDim.x = clusterX;
+ launchAttrs[numAttrs].value.clusterDim.y = clusterY;
+ launchAttrs[numAttrs].value.clusterDim.z = clusterZ;
+ numAttrs++;
+
+ launchAttrs[numAttrs].id =
+ CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+ launchAttrs[numAttrs].value.clusterSchedulingPolicyPreference =
+ CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+ numAttrs++;
+ }
+
+ launchAttrs[numAttrs].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
+ launchAttrs[numAttrs].value.cooperative = 1;
+ numAttrs++;
+
+ config.numAttrs = numAttrs;
+ config.attrs = launchAttrs;
+
+ debug_print("Launching cooperative kernel (cluster=%d), "
+ "grid=%ld,%ld,%ld, "
+ "threads: %ld, %ld, %ld, "
+ "smem: %dkb\n",
+ hasCluster, gridX, gridY, gridZ, blockX, blockY, blockZ, smem);
+
+ CUDA_REPORT_IF_ERROR(cuLaunchKernelEx(&config, function, params, extra));
+}
+
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchClusterKernel(
CUfunction function, intptr_t clusterX, intptr_t clusterY,
intptr_t clusterZ, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index e729e4f9fca9d..251245106c56e 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -69,6 +69,26 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
stream, params, extra));
}
+// Cooperative launch entry point. The cluster dimensions are accepted to
+// match the CUDA wrapper signature, but HIP does not support thread block
+// clusters; passing nonzero cluster dimensions is a usage error.
+extern "C" void mgpuLaunchKernelCooperative(
+ hipFunction_t function, intptr_t gridX, intptr_t gridY, intptr_t gridZ,
+ intptr_t clusterX, intptr_t clusterY, intptr_t clusterZ, intptr_t blockX,
+ intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream,
+ void **params, void ** /*extra*/) {
+ if (clusterX != 0 || clusterY != 0 || clusterZ != 0) {
+ fprintf(stderr,
+ "mgpuLaunchKernelCooperative: HIP does not support thread block "
+ "clusters (got cluster=%ld,%ld,%ld)\n",
+ clusterX, clusterY, clusterZ);
+ abort();
+ }
+ HIP_REPORT_IF_ERROR(
+ hipModuleLaunchCooperativeKernel(function, gridX, gridY, gridZ, blockX,
+ blockY, blockZ, smem, stream, params));
+}
+
extern "C" hipStream_t mgpuStreamCreate() {
hipStream_t stream = nullptr;
HIP_REPORT_IF_ERROR(hipStreamCreate(&stream));
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
index c25e9a3c36973..016159391c33f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -215,6 +215,9 @@ class LaunchKernel {
// Get the kernel launch callee.
FunctionCallee getClusterKernelLaunchFn();
+ // Get the cooperative kernel launch callee.
+ FunctionCallee getKernelLaunchCooperativeFn();
+
// Get the module function callee.
FunctionCallee getModuleFunctionFn();
@@ -311,6 +314,17 @@ llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
false));
}
+llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchCooperativeFn() {
+ return module.getOrInsertFunction(
+ "mgpuLaunchKernelCooperative",
+ FunctionType::get(
+ voidTy,
+ ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
+ intPtrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy,
+ i32Ty, ptrTy, ptrTy, ptrTy}),
+ false));
+}
+
llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
return module.getOrInsertFunction(
"mgpuModuleGetFunction",
@@ -452,8 +466,25 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
// Create the launch call.
Value *nullPtr = ConstantPointerNull::get(ptrTy);
- // Launch kernel with clusters if cluster size is specified.
- if (op.hasClusterSize()) {
+ // Cooperative launches go through mgpuLaunchKernelCooperative, which also
+ // handles an optional cluster. Cluster-only (non-cooperative) launches keep
+ // their existing path through mgpuLaunchClusterKernel. Plain launches go
+ // through mgpuLaunchKernel.
+ if (op.getCooperative()) {
+ Value *cx = ConstantInt::get(intPtrTy, 0);
+ Value *cy = ConstantInt::get(intPtrTy, 0);
+ Value *cz = ConstantInt::get(intPtrTy, 0);
+ if (op.hasClusterSize()) {
+ mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
+ cx = llvmValue(cluster.x);
+ cy = llvmValue(cluster.y);
+ cz = llvmValue(cluster.z);
+ }
+ builder.CreateCall(
+ getKernelLaunchCooperativeFn(),
+ ArrayRef<Value *>({moduleFunction, gx, gy, gz, cx, cy, cz, bx, by, bz,
+ dynamicMemorySize, stream, argArray, nullPtr}));
+ } else if (op.hasClusterSize()) {
mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
*cz = llvmValue(cluster.z);
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index cbafc376fb89a..11cea6f82d7b5 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -17,6 +17,18 @@ module attributes {gpu.container_module} {
return
}
+ // CHECK-LABEL:func @launch_cooperative(%{{.*}}: index)
+ func.func @launch_cooperative(%sz : index) {
+ // CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) cooperative
+ gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
+ threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz)
+ cooperative {
+ // CHECK: gpu.terminator
+ gpu.terminator
+ }
+ return
+ }
+
// CHECK-LABEL:func @launch_with_module_func_attr(%{{.*}}: index)
func.func @launch_with_module_func_attr(%sz : index) {
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) module(@test_module) function(@test_kernel_func)
@@ -233,6 +245,12 @@ module attributes {gpu.container_module} {
// CHECK: gpu.launch_func @kernels::@kernel_1 clusters in (%{{.*}}, %{{.*}}, %{{.*}}) blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
gpu.launch_func @kernels::@kernel_1 clusters in (%cst, %cst, %cst) blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
+ // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) cooperative args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+ gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) cooperative args(%0 : f32, %1 : memref<?xf32, 1>)
+
+ // CHECK: gpu.launch_func @kernels::@kernel_1 clusters in (%{{.*}}, %{{.*}}, %{{.*}}) blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) cooperative args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
+ gpu.launch_func @kernels::@kernel_1 clusters in (%cst, %cst, %cst) blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) cooperative args(%0 : f32, %1 : memref<?xf32, 1>)
+
gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) dynamic_shared_memory_size %c0 args(%0 : f32, %1 : memref<?xf32, 1>)
// CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
diff --git a/mlir/test/Dialect/GPU/outlining.mlir b/mlir/test/Dialect/GPU/outlining.mlir
index 25220dff7a5bb..f708561bc2f01 100644
--- a/mlir/test/Dialect/GPU/outlining.mlir
+++ b/mlir/test/Dialect/GPU/outlining.mlir
@@ -705,3 +705,17 @@ module attributes {gpu.container_module} {
return
}
}
+
+// -----
+
+// CHECK-LABEL: func @launch_cooperative
+func.func @launch_cooperative() {
+ %cst = arith.constant 8 : index
+ // CHECK: gpu.launch_func @launch_cooperative_kernel::@launch_cooperative_kernel blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) cooperative
+ gpu.launch blocks(%bx, %by, %bz) in (%gx = %cst, %gy = %cst, %gz = %cst)
+ threads(%tx, %ty, %tz) in (%bxs = %cst, %bys = %cst, %bzs = %cst)
+ cooperative {
+ gpu.terminator
+ }
+ return
+}
diff --git a/mlir/test/Target/LLVMIR/gpu.mlir b/mlir/test/Target/LLVMIR/gpu.mlir
index 0d29a95b12266..da816bf24f63a 100644
--- a/mlir/test/Target/LLVMIR/gpu.mlir
+++ b/mlir/test/Target/LLVMIR/gpu.mlir
@@ -93,7 +93,7 @@ module attributes {gpu.container_module} {
%c1 = llvm.mlir.constant(1 : index) : i64
%c2 = llvm.mlir.constant(2 : index) : i64
%c3 = llvm.mlir.constant(3 : index) : i64
- gpu.launch_func @kernel_module::@kernel
+ gpu.launch_func @kernel_module::@kernel
clusters in (%c1, %c1, %c1)
blocks in (%c2, %c2, %c2)
threads in (%c3, %c3, %c3) : i64
@@ -103,6 +103,50 @@ module attributes {gpu.container_module} {
// -----
+// Test cooperative launch without a cluster: lowers to
+// mgpuLaunchKernelCooperative with cluster dims = 0.
+module attributes {gpu.container_module} {
+ gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">]
+ llvm.func @cooperative_no_cluster() {
+ // CHECK: call void @mgpuLaunchKernelCooperative(
+ // CHECK-SAME: i64 2, i64 2, i64 2,
+ // CHECK-SAME: i64 0, i64 0, i64 0,
+ // CHECK-SAME: i64 3, i64 3, i64 3,
+ %c2 = llvm.mlir.constant(2 : index) : i64
+ %c3 = llvm.mlir.constant(3 : index) : i64
+ gpu.launch_func @kernel_module::@kernel
+ blocks in (%c2, %c2, %c2)
+ threads in (%c3, %c3, %c3) : i64
+ cooperative
+ llvm.return
+ }
+}
+
+// -----
+
+// Test cooperative launch combined with a cluster: lowers to
+// mgpuLaunchKernelCooperative with the real cluster dims.
+module attributes {gpu.container_module} {
+ gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">]
+ llvm.func @cooperative_with_cluster() {
+ // CHECK: call void @mgpuLaunchKernelCooperative(
+ // CHECK-SAME: i64 2, i64 2, i64 2,
+ // CHECK-SAME: i64 1, i64 1, i64 1,
+ // CHECK-SAME: i64 3, i64 3, i64 3,
+ %c1 = llvm.mlir.constant(1 : index) : i64
+ %c2 = llvm.mlir.constant(2 : index) : i64
+ %c3 = llvm.mlir.constant(3 : index) : i64
+ gpu.launch_func @kernel_module::@kernel
+ clusters in (%c1, %c1, %c1)
+ blocks in (%c2, %c2, %c2)
+ threads in (%c3, %c3, %c3) : i64
+ cooperative
+ llvm.return
+ }
+}
+
+// -----
+
// Checking that ELF section is populated
module attributes {gpu.container_module} {
// CHECK: @cuda_device_mod_binary = internal constant [4 x i8] c"BLOB", section "__nv_rel_fatbin", align 8
More information about the Mlir-commits
mailing list