[flang-commits] [flang] ad52fb3 - [flang][cuda] Use walk instead of getOps to process nested cuf.shared_memory ops (#188812)
via flang-commits
flang-commits at lists.llvm.org
Thu Mar 26 11:46:10 PDT 2026
Author: Zhen Wang
Date: 2026-03-26T18:46:05Z
New Revision: ad52fb36c64149b4219d101928930932c42687f8
URL: https://github.com/llvm/llvm-project/commit/ad52fb36c64149b4219d101928930932c42687f8
DIFF: https://github.com/llvm/llvm-project/commit/ad52fb36c64149b4219d101928930932c42687f8.diff
LOG: [flang][cuda] Use walk instead of getOps to process nested cuf.shared_memory ops (#188812)
The CUFComputeSharedMemoryOffsetsAndSize pass used getOps to find
cuf.shared_memory operations, which only searches direct children of
gpu.func. When cuf.shared_memory ops are nested inside scf.parallel
(e.g. from reduction lowering), they are missed and never receive
offset/isStatic attributes, which later triggers the "cuf.shared_memory
must have an offset for code gen" assertion failure. Switch to walk so
that cuf.shared_memory ops are found at any nesting depth.
Added:
Modified:
flang/lib/Optimizer/Transforms/CUDA/CUFComputeSharedMemoryOffsetsAndSize.cpp
flang/test/Fir/CUDA/cuda-shared-offset.mlir
Removed:
################################################################################
diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFComputeSharedMemoryOffsetsAndSize.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFComputeSharedMemoryOffsetsAndSize.cpp
index 87dc27e0382ef..18537f677661a 100644
--- a/flang/lib/Optimizer/Transforms/CUDA/CUFComputeSharedMemoryOffsetsAndSize.cpp
+++ b/flang/lib/Optimizer/Transforms/CUDA/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -105,11 +105,10 @@ struct CUFComputeSharedMemoryOffsetsAndSize
unsigned short alignment = 0;
mlir::Value crtDynOffset;
- // Go over each shared memory operation and compute their start offset and
- // the size and alignment of the global to be generated if all variables
- // are static. If this is dynamic shared memory, then only the alignment
- // is computed.
- for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
+ // Walk all shared memory operations (including those nested inside
+ // scf.parallel from reduction lowering) and compute their start offset
+ // and the size and alignment of the global to be generated.
+ funcOp.walk([&](cuf::SharedMemoryOp sharedOp) {
mlir::Location loc = sharedOp.getLoc();
builder.setInsertionPoint(sharedOp);
if (fir::hasDynamicSize(sharedOp.getInType())) {
@@ -155,7 +154,7 @@ struct CUFComputeSharedMemoryOffsetsAndSize
sharedOp.setIsStatic(true);
++nbStaticSharedVariables;
}
- }
+ });
if (nbDynamicSharedVariables == 0 && nbStaticSharedVariables == 0)
continue;
diff --git a/flang/test/Fir/CUDA/cuda-shared-offset.mlir b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
index 59ab040529ce4..a33c89b6d9af2 100644
--- a/flang/test/Fir/CUDA/cuda-shared-offset.mlir
+++ b/flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -165,3 +165,40 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: fir.global external @_QMmtestsPtestany__shared_mem__ {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+
+// -----
+
+// Test that cuf.shared_memory ops nested inside scf.parallel (from reduction
+// lowering) are also processed. walk is needed instead of getOps to find
+// ops at any nesting depth within gpu.func.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+ gpu.module @cuda_device_mod {
+ gpu.func @_QPreduce_kernel(%arg0: !fir.ref<i32>, %arg1: index) kernel {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c0_i32 = arith.constant 0 : i32
+ %0 = cuf.shared_memory i32 {bindc_name = "_lro_private_0", uniq_name = "_lro_private_0"} -> !fir.ref<i32>
+ %1 = fir.convert %0 : (!fir.ref<i32>) -> memref<i32>
+ memref.store %c0_i32, %1[] : memref<i32>
+ %2 = scf.parallel (%iv) = (%c0) to (%arg1) step (%c1) init (%c0_i32) -> i32 {
+ %3 = cuf.shared_memory i32 {bindc_name = "_lro_private_1", uniq_name = "_lro_private_1"} -> !fir.ref<i32>
+ %4 = fir.convert %3 : (!fir.ref<i32>) -> memref<i32>
+ %5 = memref.load %4[] : memref<i32>
+ scf.reduce(%5 : i32) {
+ ^bb0(%lhs: i32, %rhs: i32):
+ %6 = arith.addi %lhs, %rhs : i32
+ scf.reduce.return %6 : i32
+ }
+ }
+ gpu.return
+ }
+ }
+}
+
+// CHECK-LABEL: gpu.func @_QPreduce_kernel
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "_lro_private_0", isStatic, uniq_name = "_lro_private_0"} -> !fir.ref<i32>
+// CHECK: scf.parallel
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 align 4 {bindc_name = "_lro_private_1", isStatic, uniq_name = "_lro_private_1"} -> !fir.ref<i32>
+// CHECK: fir.global internal @_QPreduce_kernel__shared_mem___lro_private_0 {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
+// CHECK: fir.global internal @_QPreduce_kernel__shared_mem___lro_private_1 {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<4xi8>
More information about the flang-commits
mailing list