[Mlir-commits] [llvm] [mlir] [OpenMP][IRBuilder] Don't initialize `kmp_dep_info` instances in alloc regions (PR #131795)

Kareem Ergawy llvmlistbot at llvm.org
Tue Mar 18 05:55:58 PDT 2025


https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/131795



Fixes #121289

Given the following MLIR, where a variable: `x`, is `private` for the `omp.parallel` op and a `depend` for the `omp.task` op:
```mlir
omp.private {type = private} @_QFEx_private_i32 : i32
llvm.func @nested_task_with_deps() {
  %0 = llvm.mlir.constant(1 : i64) : i64
  %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
  %2 = llvm.mlir.constant(1 : i64) : i64
  omp.parallel private(@_QFEx_private_i32 %1 -> %arg0 : !llvm.ptr) {
    omp.task depend(taskdependout -> %arg0 : !llvm.ptr) {
      omp.terminator
    }
    omp.terminator
  }
  llvm.return
}
```

Before the fix proposed by this PR, the IR builder would emit the allocation and the initialzation logic for the task's depedency info struct in the parent function's alloc region:
```llvm
define void @nested_task_with_deps() {
  ....
  %.dep.arr.addr = alloca [1 x %struct.kmp_dep_info], align 8
  %2 = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %.dep.arr.addr, i64 0, i64 0
  %3 = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %2, i32 0, i32 0
  %4 = ptrtoint ptr %omp.private.alloc to i64
  store i64 %4, ptr %3, align 4
  ....
  br label %entry

omp.par.entry:                                    ; preds = %entry
  ....
  %omp.private.alloc = alloca i32, align 4
```

Note the following:
- The private value `x` is alloced where it should be in the parallel op's entry region,
- howerver, since the privae value is also a depedency of the task and since allocation and initialzation of the task depedency info struct both happen in the alloc region,
- this results in the private value being referenced before it is actually defined.

This PR fixes the issue by only allocating the task depedency info in the alloc region while initialzation happens in the current IP of the function with the rest of the logic that depends on it.

>From 953f41d06e797b49313bce326d9c4b328e3fdb2c Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 18 Mar 2025 07:39:21 -0500
Subject: [PATCH] [OpenMP][IRBuilder] Don't initialize `kmp_dep_info` instances
 in alloc regions

Fixes #121289

Given the following MLIR, where a variable: `x`, is `private` for the
`omp.parallel` op and a `depend` for the `omp.task` op:
```mlir
omp.private {type = private} @_QFEx_private_i32 : i32
llvm.func @nested_task_with_deps() {
  %0 = llvm.mlir.constant(1 : i64) : i64
  %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
  %2 = llvm.mlir.constant(1 : i64) : i64
  omp.parallel private(@_QFEx_private_i32 %1 -> %arg0 : !llvm.ptr) {
    omp.task depend(taskdependout -> %arg0 : !llvm.ptr) {
      omp.terminator
    }
    omp.terminator
  }
  llvm.return
}
```

Before the fix proposed by this PR, the IR builder would emit the
allocation and the initialzation logic for the task's depedency info
struct in the parent function's alloc region:
```llvm
define void @nested_task_with_deps() {
  ....
  %.dep.arr.addr = alloca [1 x %struct.kmp_dep_info], align 8
  %2 = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %.dep.arr.addr, i64 0, i64 0
  %3 = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %2, i32 0, i32 0
  %4 = ptrtoint ptr %omp.private.alloc to i64
  store i64 %4, ptr %3, align 4
  ....
  br label %entry

omp.par.entry:                                    ; preds = %entry
  ....
  %omp.private.alloc = alloca i32, align 4
```

Note the following:
- The private value `x` is alloced where it should be in the parallel
  op's entry region,
- howerver, since the privae value is also a depedency of the task and
  since allocation and initialzation of the task depedency info struct
  both happen in the alloc region,
- this results in the private value being referenced before it is
  actually defined.

This PR fixes the issue by only allocating the task depedency info in
the alloc region while initialzation happens in the current IP of the
function with the rest of the logic that depends on it.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp |  3 +-
 mlir/test/Target/LLVMIR/openmp-llvm.mlir  | 67 ++++++++++++++++++++---
 2 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8dcebcdb8791d..c1a533aea529f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2056,6 +2056,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
       Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
       DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
 
+      Builder.restoreIP(OldIP);
+
       unsigned P = 0;
       for (const DependData &Dep : Dependencies) {
         Value *Base =
@@ -2085,7 +2087,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
         ++P;
       }
 
-      Builder.restoreIP(OldIP);
     }
 
     // In the presence of the `if` clause, the following IR is generated:
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index e4114d491fc85..d7f4d0a65b24c 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -2651,6 +2651,16 @@ llvm.func @omp_task_attrs() -> () attributes {
 // CHECK-LABEL: define void @omp_task_with_deps
 // CHECK-SAME: (ptr %[[zaddr:.+]])
 // CHECK:  %[[dep_arr_addr:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR1:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR2:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR3:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR4:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+
+// CHECK: %[[omp_global_thread_num:.+]] = call i32 @__kmpc_global_thread_num({{.+}})
+// CHECK: %[[task_data:.+]] = call ptr @__kmpc_omp_task_alloc
+// CHECK-SAME: (ptr @{{.+}}, i32 %[[omp_global_thread_num]], i32 1, i64 40,
+// CHECK-SAME:  i64 0, ptr @[[outlined_fn:.+]])
+
 // CHECK:  %[[dep_arr_addr_0:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[dep_arr_addr]], i64 0, i64 0
 // CHECK:  %[[dep_arr_addr_0_val:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[dep_arr_addr_0]], i32 0, i32 0
 // CHECK:  %[[dep_arr_addr_0_val_int:.+]] = ptrtoint ptr %0 to i64
@@ -2659,40 +2669,33 @@ llvm.func @omp_task_attrs() -> () attributes {
 // CHECK:  store i64 8, ptr %[[dep_arr_addr_0_size]], align 4
 // CHECK:  %[[dep_arr_addr_0_kind:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[dep_arr_addr_0]], i32 0, i32 2
 // CHECK: store i8 1, ptr %[[dep_arr_addr_0_kind]], align 1
+
+// CHECK: call i32 @__kmpc_omp_task_with_deps(ptr @{{.+}}, i32 %[[omp_global_thread_num]], ptr %[[task_data]], {{.*}})
 // -----
 // dependence_type: Out
-// CHECK:  %[[DEP_ARR_ADDR1:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_1:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR1]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_1:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_1]], i32 0, i32 2
 // CHECK:  store i8 3, ptr %[[DEP_TYPE_1]], align 1
 // -----
 // dependence_type: Inout
-// CHECK:  %[[DEP_ARR_ADDR2:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_2:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR2]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_2:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_2]], i32 0, i32 2
 // CHECK:  store i8 3, ptr %[[DEP_TYPE_2]], align 1
 // -----
 // dependence_type: Mutexinoutset
-// CHECK:  %[[DEP_ARR_ADDR3:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_3:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR3]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_3:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_3]], i32 0, i32 2
 // CHECK:  store i8 4, ptr %[[DEP_TYPE_3]], align 1
 // -----
 // dependence_type: Inoutset
-// CHECK:  %[[DEP_ARR_ADDR4:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_4:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR4]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_4:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_4]], i32 0, i32 2
 // CHECK:  store i8 8, ptr %[[DEP_TYPE_4]], align 1
 llvm.func @omp_task_with_deps(%zaddr: !llvm.ptr) {
-  // CHECK: %[[omp_global_thread_num:.+]] = call i32 @__kmpc_global_thread_num({{.+}})
-  // CHECK: %[[task_data:.+]] = call ptr @__kmpc_omp_task_alloc
-  // CHECK-SAME: (ptr @{{.+}}, i32 %[[omp_global_thread_num]], i32 1, i64 40,
-  // CHECK-SAME:  i64 0, ptr @[[outlined_fn:.+]])
-  // CHECK: call i32 @__kmpc_omp_task_with_deps(ptr @{{.+}}, i32 %[[omp_global_thread_num]], ptr %[[task_data]], {{.*}})
   omp.task depend(taskdependin -> %zaddr : !llvm.ptr) {
     %n = llvm.mlir.constant(1 : i64) : i64
     %valaddr = llvm.alloca %n x i32 : (i64) -> !llvm.ptr
@@ -3373,3 +3376,49 @@ llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
 // CHECK:         %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
 // CHECK:         %[[DIST_UB:.*]] = alloca i32
 // CHECK:         call void @__kmpc_dist_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 34, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[DIST_UB]], ptr %[[STRIDE]], i32 1, i32 0)
+
+// -----
+
+omp.private {type = private} @_QFEx_private_i32 : i32
+llvm.func @nested_task_with_deps() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  omp.parallel private(@_QFEx_private_i32 %1 -> %arg0 : !llvm.ptr) {
+    omp.task depend(taskdependout -> %arg0 : !llvm.ptr) {
+      omp.terminator
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: define void @nested_task_with_deps() {
+// CHECK:         %[[PAR_FORK_ARG:.*]] = alloca { ptr }, align 8
+// CHECK:         %[[DEP_ARR:.*]] = alloca [1 x %struct.kmp_dep_info], align 8
+
+// CHECK:       omp_parallel:
+// CHECK-NEXT:    %[[DEP_ARR_GEP:.*]] = getelementptr { ptr }, ptr %[[PAR_FORK_ARG]], i32 0, i32 0
+// CHECK-NEXT:    store ptr %[[DEP_ARR]], ptr %[[DEP_ARR_GEP]], align 8
+// CHECK-NEXT:    call void {{.*}} @__kmpc_fork_call(ptr @{{.*}}, i32 1, ptr @[[PAR_OUTLINED:.*]], ptr %[[PAR_FORK_ARG]])
+// CHECK-NEXT:    br label %[[PAR_EXIT:.*]]
+
+// CHECK:       [[PAR_EXIT]]:
+// CHECK-NEXT:    ret void
+// CHECK:       }
+
+// CHECK:       define internal void @[[PAR_OUTLINED]]{{.*}} {
+// CHECK:       omp.par.entry:
+// CHECK:         %[[DEP_ARR_GEP_2:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
+// CHECK:         %[[DEP_ARR_2:.*]] = load ptr, ptr %[[DEP_ARR_GEP_2]], align 8
+// CHECK:         %[[PRIV_ALLOC:omp.private.alloc]] = alloca i32, align 4
+
+// CHECK:         %[[TASK:.*]] = call ptr @__kmpc_omp_task_alloc
+// CHECK:         %[[DEP_STRUCT_GEP:.*]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_2]], i64 0, i64 0
+// CHECK:         %[[DEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_STRUCT_GEP]], i32 0, i32 0
+// CHECK:         %[[PRIV_ALLOC_TO_INT:.*]] = ptrtoint ptr %[[PRIV_ALLOC]] to i64
+// CHECK:         store i64 %[[PRIV_ALLOC_TO_INT]], ptr %[[DEP_GEP]], align 4
+// CHECK:         call i32 @__kmpc_omp_task_with_deps(ptr @{{.*}}, i32 %{{.*}}, ptr %{{.*}}, i32 1, ptr %[[DEP_ARR_2]], i32 0, ptr null)
+
+// CHECK:         ret void
+// CHECK:       }



More information about the Mlir-commits mailing list