[flang-commits] [flang] [flang][cuda] Carry over the dynamic shared memory size to gpu.launch_func (PR #132837)

via flang-commits flang-commits at lists.llvm.org
Mon Mar 24 15:15:43 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes:

When converting the `cuf.kernel_launch` operation to `gpu.launch_func`, the dynamic shared memory size was always set to zero. Carry over this value from the `cuf.kernel_launch` op instead.

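For context, the dynamic shared memory size originates from the optional third chevron argument of a CUDA Fortran kernel launch. A minimal sketch is shown below; the kernel name, grid/block sizes, and the 1024-byte figure are illustrative only and are not taken from the patch:

```fortran
! Hypothetical CUDA Fortran example (names and sizes are illustrative).
! The third chevron argument is the dynamic shared memory size in bytes;
! it is the value cuf.kernel_launch carries and that this patch now
! forwards to gpu.launch_func as dynamic_shared_memory_size.
module example_mod
contains
  attributes(global) subroutine sub_device1()
    real, shared :: buf(*)              ! sized at launch time by the bytes argument
    buf(threadIdx%x) = real(threadIdx%x)
  end subroutine sub_device1
end module example_mod

program launch_example
  use cudafor
  use example_mod
  ! 1 block of 256 threads with 1024 bytes of dynamic shared memory.
  call sub_device1<<<1, 256, 1024>>>()
end program launch_example
```
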
---
Full diff: https://github.com/llvm/llvm-project/pull/132837.diff


2 Files Affected:

- (modified) flang/lib/Optimizer/Transforms/CUFOpConversion.cpp (+4-3) 
- (modified) flang/test/Fir/CUDA/cuda-launch.fir (+4) 


``````````diff
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 0fbec8a204b8d..a01100511ec66 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -810,7 +810,7 @@ struct CUFLaunchOpConversion
                   mlir::PatternRewriter &rewriter) const override {
     mlir::Location loc = op.getLoc();
     auto idxTy = mlir::IndexType::get(op.getContext());
-    auto zero = rewriter.create<mlir::arith::ConstantOp>(
+    mlir::Value zero = rewriter.create<mlir::arith::ConstantOp>(
         loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
     auto gridSizeX =
         rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
@@ -869,10 +869,11 @@ struct CUFLaunchOpConversion
       }
       args.push_back(arg);
     }
-
+    mlir::Value dynamicShmemSize = op.getBytes() ? op.getBytes() : zero;
     auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
         loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
-        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args);
+        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ},
+        dynamicShmemSize, args);
     if (clusterDimX && clusterDimY && clusterDimZ) {
       gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
       gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 7833fc7b490bf..b8d79ca06ffd6 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -23,11 +23,15 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
     // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
     %c1 = arith.constant 1 : index
     %c11_i32 = arith.constant 11 : i32
+    %c1024_i32 = arith.constant 1024 : i32
     %c6_i32 = arith.constant 6 : i32
     %c1_i32 = arith.constant 1 : i32
     // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}}
     cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
 
+    // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c1024{{.*}}
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()
+
     // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
     cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
     return

``````````



https://github.com/llvm/llvm-project/pull/132837

