[flang-commits] [flang] 5be9082 - [flang][cuda] Carry over the dynamic shared memory size to gpu.launch_func (#132837)

via flang-commits flang-commits at lists.llvm.org
Mon Mar 24 18:37:23 PDT 2025


Author: Valentin Clement (バレンタイン クレメン)
Date: 2025-03-24T18:37:19-07:00
New Revision: 5be9082fed7966dfbbbf6e9dfff44d5fb6c5b4fb

URL: https://github.com/llvm/llvm-project/commit/5be9082fed7966dfbbbf6e9dfff44d5fb6c5b4fb
DIFF: https://github.com/llvm/llvm-project/commit/5be9082fed7966dfbbbf6e9dfff44d5fb6c5b4fb.diff

LOG: [flang][cuda] Carry over the dynamic shared memory size to gpu.launch_func (#132837)
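
Previously, CUFLaunchOpConversion always materialized a constant zero for
the dynamic shared memory size, so the bytes value from the launch chevron
was dropped. The conversion now forwards the optional bytes operand of
cuf.kernel_launch as the dynamic_shared_memory_size operand of
gpu.launch_func, keeping the zero constant only as the fallback when no
size is specified.

As a rough sketch (the SSA names %gx/%bx etc. are illustrative
placeholders, not verbatim compiler output), a launch carrying 1024 bytes
of dynamic shared memory in its chevron:

    %c1024_i32 = arith.constant 1024 : i32
    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()

now lowers to something like:

    // The 1024-byte size is carried over instead of a constant 0.
    gpu.launch_func @cuda_device_mod::@_QPsub_device1
        blocks in (%gx, %gy, %gz) threads in (%bx, %by, %bz)
        dynamic_shared_memory_size %c1024_i32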

Added: 
    

Modified: 
    flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
    flang/test/Fir/CUDA/cuda-launch.fir

Removed: 
    


################################################################################
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index 0fbec8a204b8d..a01100511ec66 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -810,7 +810,7 @@ struct CUFLaunchOpConversion
                   mlir::PatternRewriter &rewriter) const override {
     mlir::Location loc = op.getLoc();
     auto idxTy = mlir::IndexType::get(op.getContext());
-    auto zero = rewriter.create<mlir::arith::ConstantOp>(
+    mlir::Value zero = rewriter.create<mlir::arith::ConstantOp>(
         loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
     auto gridSizeX =
         rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
@@ -869,10 +869,11 @@ struct CUFLaunchOpConversion
       }
       args.push_back(arg);
     }
-
+    mlir::Value dynamicShmemSize = op.getBytes() ? op.getBytes() : zero;
     auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
         loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
-        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero, args);
+        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ},
+        dynamicShmemSize, args);
     if (clusterDimX && clusterDimY && clusterDimZ) {
       gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
       gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);

diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 7833fc7b490bf..b8d79ca06ffd6 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -23,11 +23,15 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
     // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
     %c1 = arith.constant 1 : index
     %c11_i32 = arith.constant 11 : i32
+    %c1024_i32 = arith.constant 1024 : i32
     %c6_i32 = arith.constant 6 : i32
     %c1_i32 = arith.constant 1 : i32
     // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}}
     cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
 
+    // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c1024{{.*}}
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()
+
     // CHECK: gpu.launch_func  @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})  dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
     cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
     return