[flang-commits] [flang] [flang][cuda] Fix shared memory offset computation for nested cuf.shared_memory ops (PR #188659)

Wed Mar 25 20:48:55 PDT 2026

================
@@ -165,3 +165,40 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
 // CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %{{.*}} : index {bindc_name = "smasks", uniq_name = "_QMmtestsFtestanyEsmasks"} -> !fir.ref<!fir.array<?xf32>>
 
 // CHECK: fir.global external @_QMmtestsPtestany__shared_mem__ {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
+
+// -----
+
+// Test that cuf.shared_memory ops nested inside scf.parallel (from reduction
+// lowering) are also processed. Before the fix, only direct children of
+// gpu.func were visited via getOps; walk is needed for nested ops.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QPreduce_kernel(%arg0: !fir.ref<i32>, %arg1: index) kernel {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c0_i32 = arith.constant 0 : i32
+      %0 = cuf.shared_memory i32 {bindc_name = "_lro_private", uniq_name = "_lro_private"} -> !fir.ref<i32>
+      %1 = fir.convert %0 : (!fir.ref<i32>) -> memref<i32>
+      memref.store %c0_i32, %1[] : memref<i32>
+      %2 = scf.parallel (%iv) = (%c0) to (%arg1) step (%c1) init (%c0_i32) -> i32 {
+        %3 = cuf.shared_memory i32 {bindc_name = "_lro_private", uniq_name = "_lro_private"} -> !fir.ref<i32>
----------------
wangzpgi wrote:

The two cuf.shared_memory ops with the same bindc_name = "_lro_private" are two access sites into the same allocation: one stores the warp partial sums, and the other reads them back after the barrier. I wrote an equivalent CUDA C reduction and compiled it with nvcc -ptx; the generated PTX contains a single .shared declaration that is referenced from both sites.

https://github.com/llvm/llvm-project/pull/188659