[flang-commits] [flang] 5be9082 - [flang][cuda] Carry over the dynamic shared memory size to gpu.launch_func (#132837)
via flang-commits
flang-commits at lists.llvm.org
Mon Mar 24 18:37:23 PDT 2025
Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-03-24T18:37:19-07:00
New Revision: 5be9082fed7966dfbbbf6e9dfff44d5fb6c5b4fb
URL: https://github.com/llvm/llvm-project/commit/5be9082fed7966dfbbbf6e9dfff44d5fb6c5b4fb
DIFF: https://github.com/llvm/llvm-project/commit/5be9082fed7966dfbbbf6e9dfff44d5fb6c5b4fb.diff
LOG: [flang][cuda] Carry over the dynamic shared memory size to gpu.launch_func (#132837)
Added:
Modified:
flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
flang/test/Fir/CUDA/cuda-launch.fir
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 0fbec8a204b8d..a01100511ec66 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -810,7 +810,7 @@ struct CUFLaunchOpConversion
mlir::PatternRewriter &rewriter) const override {
mlir::Location loc = op.getLoc();
auto idxTy = mlir::IndexType::get(op.getContext());
- auto zero = rewriter.create<mlir::arith::ConstantOp>(
+ mlir::Value zero = rewriter.create<mlir::arith::ConstantOp>(
loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
auto gridSizeX =
rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
@@ -869,10 +869,11 @@ struct CUFLaunchOpConversion
}
args.push_back(arg);
}
-
+ mlir::Value dynamicShmemSize = op.getBytes() ? op.getBytes() : zero;
auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
- mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args);
+ mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ},
+ dynamicShmemSize, args);
if (clusterDimX && clusterDimY && clusterDimZ) {
gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 7833fc7b490bf..b8d79ca06ffd6 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -23,11 +23,15 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK: %[[ALLOCA:.*]] = fir.alloca f32
%c1 = arith.constant 1 : index
%c11_i32 = arith.constant 11 : i32
+ %c1024_i32 = arith.constant 1024 : i32
%c6_i32 = arith.constant 6 : i32
%c1_i32 = arith.constant 1 : i32
// CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+ // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}}
+ cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()
+
// CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
return
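
For context, the bytes operand carried over here corresponds to the optional dynamic shared memory argument of the CUDA Fortran chevron launch syntax. A minimal sketch of source that would exercise the new path (the kernel name and sizes below are illustrative, not taken from this commit):

    ! Device kernel using dynamically sized shared memory.
    attributes(global) subroutine sub1()
      real, shared :: buf(*)        ! assumed-size shared array, backed by the launch-time byte count
      buf(threadIdx%x) = 0.0
    end subroutine sub1

    ! Host-side launch: the third chevron argument requests 1024 bytes of
    ! dynamic shared memory, which lowers to the bytes operand of
    ! cuf.kernel_launch and now reaches gpu.launch_func as
    ! dynamic_shared_memory_size.
    call sub1<<<1, 32, 1024>>>()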