[llvm-branch-commits] [flang] [mlir] [Flang][OpenMP] Add pass to replace allocas with device shared memory (PR #161863)

Mon Feb 23 05:57:41 PST 2026

================
@@ -0,0 +1,215 @@
+// RUN: fir-opt --split-input-file --omp-stack-to-shared %s | FileCheck %s
+
+module attributes {omp.is_target_device = true} {
+  omp.declare_reduction @add_reduction_i32 : i32 init {
+  ^bb0(%arg0: i32):
+    %c0_i32 = arith.constant 0 : i32
+    omp.yield(%c0_i32 : i32)
+  } combiner {
+  ^bb0(%arg0: i32, %arg1: i32):
+    %0 = arith.addi %arg0, %arg1 : i32
+    omp.yield(%0 : i32)
+  }
+
+  omp.private {type = private} @privatizer_i32 : i32
+  omp.private {type = firstprivate} @firstprivatizer_i32 : i32 copy {
+  ^bb0(%arg0: i32, %arg1: i32):
+    omp.yield(%arg0 : i32)
+  }
+
+  // Verify that target device functions are searched for allocas shared across
+  // threads of a parallel region.
+  //
+  // Also ensure that all fir.alloca information is adequately forwarded to the
+  // new allocation, that uses of the allocation through hlfir.declare are
+  // detected and that only the expected types of uses (parallel reduction and
+  // non-private uses inside of a parallel region) are replaced.
+  // CHECK-LABEL: func.func @standalone_func
+  func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>} {
+    // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+    %0 = fir.alloca i32 {uniq_name = "x"}
+    %c = arith.constant 1 : index
+    // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem !fir.char<1,?>(%[[C:.*]] : index), %[[C]] {bindc_name = "y", uniq_name = "y"} : !fir.ref<!fir.char<1,?>>
+    %1 = fir.alloca !fir.char<1,?>(%c : index), %c {bindc_name = "y", uniq_name = "y"}
+    // CHECK: %{{.*}}:2 = hlfir.declare %[[ALLOC_1]] typeparams %[[C]] {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    %decl:2 = hlfir.declare %1 typeparams %c {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+    // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "z"}
+    %2 = fir.alloca i32 {uniq_name = "z"}
+    // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "a"} : !fir.ref<i32>
+    %3 = fir.alloca i32 {uniq_name = "a"}
+    // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"}
+    %4 = fir.alloca i32 {uniq_name = "b"}
+    omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref<i32>) {
+      // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "c"}
+      %5 = fir.alloca i32 {uniq_name = "c"}
+      %6:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+      omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref<i32>, !fir.ref<i32>) {
+        omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) {
+          %7 = fir.load %5 : !fir.ref<i32>
+          omp.yield
+        }
+      }
+      omp.terminator
+    }
+    %5 = fir.load %4 : !fir.ref<i32>
+    // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref<i32>
+    // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref<!fir.char<1,?>>
+    // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref<i32>
+    // CHECK-NEXT: return
+    return
+  }
+
+  // Verify that generic target regions are searched for allocas shared across
+  // threads of a parallel region.
+  // CHECK-LABEL: func.func @target_generic
+  func.func @target_generic() {
+    // CHECK: omp.target
+    omp.target {
+      %c = arith.constant 0 : i32
+      // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+      %0 = fir.alloca i32 {uniq_name = "x"}
----------------
skatrak wrote:

The behavior I'm checking here can be reproduced with this C++ code:
```c++
// clang++ -save-temps -fopenmp -O0 --offload-arch=gfx1100 test.cpp
// llvm-dis test-openmp-amdgcn-amd-amdhsa-gfx1100.bc
int main() {
  int x;
  #pragma omp target private(x)
  {
    // Cannot explicitly declare a variable at this point because when there's a
    // teams construct, there mustn't be any further constructs or statements.
    #pragma omp teams
    {
      int y;
      #pragma omp distribute
      for (int i = 0; i < 10; ++i) {
        int z;
        #pragma omp parallel
        {
          x = 0;
          y = 1;
          z = 2;
        }
      }
    }
  }
  return 0;
}
```
In that case, `private(x)` being the clause that could be represented as an alloca at the `omp.target` level, Clang does use `__kmpc_alloc_shared` to allocate it, like it does for `y` and `z`. It also uses shared memory for split `target parallel` cases like the one below:
```c++
void f() {
  int x;
  #pragma omp target private(x)
  {
    int y;
    #pragma omp parallel
    {
      x = 0;
      y = 1;
    }
  }
}
```

https://github.com/llvm/llvm-project/pull/161863