[llvm-branch-commits] [flang] [mlir] [Flang][OpenMP] Add pass to replace allocas with device shared memory (PR #161863)
Sergio Afonso via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Feb 23 05:57:41 PST 2026
================
@@ -0,0 +1,215 @@
+// RUN: fir-opt --split-input-file --omp-stack-to-shared %s | FileCheck %s
+
+module attributes {omp.is_target_device = true} {
+ omp.declare_reduction @add_reduction_i32 : i32 init {
+ ^bb0(%arg0: i32):
+ %c0_i32 = arith.constant 0 : i32
+ omp.yield(%c0_i32 : i32)
+ } combiner {
+ ^bb0(%arg0: i32, %arg1: i32):
+ %0 = arith.addi %arg0, %arg1 : i32
+ omp.yield(%0 : i32)
+ }
+
+ omp.private {type = private} @privatizer_i32 : i32
+ omp.private {type = firstprivate} @firstprivatizer_i32 : i32 copy {
+ ^bb0(%arg0: i32, %arg1: i32):
+ omp.yield(%arg0 : i32)
+ }
+
+ // Verify that target device functions are searched for allocas shared across
+ // threads of a parallel region.
+ //
+ // Also ensure that all fir.alloca information is adequately forwarded to the
+ // new allocation, that uses of the allocation through hlfir.declare are
+ // detected and that only the expected types of uses (parallel reduction and
+ // non-private uses inside of a parallel region) are replaced.
+ // CHECK-LABEL: func.func @standalone_func
+ func.func @standalone_func(%lb: i32, %ub: i32, %step: i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>} {
+ // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+ %0 = fir.alloca i32 {uniq_name = "x"}
+ %c = arith.constant 1 : index
+ // CHECK: %[[ALLOC_1:.*]] = omp.alloc_shared_mem !fir.char<1,?>(%[[C:.*]] : index), %[[C]] {bindc_name = "y", uniq_name = "y"} : !fir.ref<!fir.char<1,?>>
+ %1 = fir.alloca !fir.char<1,?>(%c : index), %c {bindc_name = "y", uniq_name = "y"}
+ // CHECK: %{{.*}}:2 = hlfir.declare %[[ALLOC_1]] typeparams %[[C]] {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ %decl:2 = hlfir.declare %1 typeparams %c {uniq_name = "y"} : (!fir.ref<!fir.char<1,?>>, index) -> (!fir.boxchar<1>, !fir.ref<!fir.char<1,?>>)
+ // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "z"}
+ %2 = fir.alloca i32 {uniq_name = "z"}
+ // CHECK: %[[ALLOC_2:.*]] = omp.alloc_shared_mem i32 {uniq_name = "a"} : !fir.ref<i32>
+ %3 = fir.alloca i32 {uniq_name = "a"}
+ // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "b"}
+ %4 = fir.alloca i32 {uniq_name = "b"}
+ omp.parallel reduction(@add_reduction_i32 %0 -> %arg0 : !fir.ref<i32>) {
+ // CHECK: %{{.*}} = fir.alloca i32 {uniq_name = "c"}
+ %5 = fir.alloca i32 {uniq_name = "c"}
+ %6:2 = fir.unboxchar %decl#0 : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
+ omp.wsloop private(@privatizer_i32 %2 -> %arg1, @firstprivatizer_i32 %3 -> %arg2 : !fir.ref<i32>, !fir.ref<i32>) {
+ omp.loop_nest (%arg3) : i32 = (%lb) to (%ub) inclusive step (%step) {
+ %7 = fir.load %5 : !fir.ref<i32>
+ omp.yield
+ }
+ }
+ omp.terminator
+ }
+ %5 = fir.load %4 : !fir.ref<i32>
+ // CHECK: omp.free_shared_mem %[[ALLOC_0]] : !fir.ref<i32>
+ // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_1]] : !fir.ref<!fir.char<1,?>>
+ // CHECK-NEXT: omp.free_shared_mem %[[ALLOC_2]] : !fir.ref<i32>
+ // CHECK-NEXT: return
+ return
+ }
+
+ // Verify that generic target regions are searched for allocas shared across
+ // threads of a parallel region.
+ // CHECK-LABEL: func.func @target_generic
+ func.func @target_generic() {
+ // CHECK: omp.target
+ omp.target {
+ %c = arith.constant 0 : i32
+ // CHECK: %[[ALLOC_0:.*]] = omp.alloc_shared_mem i32 {uniq_name = "x"} : !fir.ref<i32>
+ %0 = fir.alloca i32 {uniq_name = "x"}
----------------
skatrak wrote:
The behavior I'm checking here can be reproduced with this C++ code:
```c++
// clang++ -save-temps -fopenmp -O0 --offload-arch=gfx1100 test.cpp
// llvm-dis test-openmp-amdgcn-amd-amdhsa-gfx1100.bc
int main() {
int x;
#pragma omp target private(x)
{
// Cannot explicitly declare a variable at this point because when there's a
// teams construct, there mustn't be any further constructs or statements.
#pragma omp teams
{
int y;
#pragma omp distribute
for (int i = 0; i < 10; ++i) {
int z;
#pragma omp parallel
{
x = 0;
y = 1;
z = 2;
}
}
}
}
return 0;
}
```
In that case, `private(x)` being the clause that could be represented as an alloca at the `omp.target` level, Clang does use `__kmpc_alloc_shared` to allocate it, like it does for `y` and `z`. It also uses shared memory for split `target parallel` cases like the one below:
```c++
void f() {
int x;
#pragma omp target private(x)
{
int y;
#pragma omp parallel
{
x = 0;
y = 1;
}
}
}
```
https://github.com/llvm/llvm-project/pull/161863
More information about the llvm-branch-commits
mailing list