[Mlir-commits] [llvm] [mlir] [OpenMP][IRBuilder] Don't initialize `kmp_dep_info` instances in alloc regions (PR #131795)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Tue Mar 18 05:56:34 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-mlir-llvm

Author: Kareem Ergawy (ergawy)

<details>
<summary>Changes</summary>



Fixes #<!-- -->121289

Given the following MLIR, where a variable: `x`, is `private` for the `omp.parallel` op and a `depend` for the `omp.task` op:
```mlir
omp.private {type = private} @<!-- -->_QFEx_private_i32 : i32
llvm.func @<!-- -->nested_task_with_deps() {
  %0 = llvm.mlir.constant(1 : i64) : i64
  %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
  %2 = llvm.mlir.constant(1 : i64) : i64
  omp.parallel private(@<!-- -->_QFEx_private_i32 %1 -> %arg0 : !llvm.ptr) {
    omp.task depend(taskdependout -> %arg0 : !llvm.ptr) {
      omp.terminator
    }
    omp.terminator
  }
  llvm.return
}
```

Before the fix proposed by this PR, the IR builder would emit the allocation and the initialzation logic for the task's depedency info struct in the parent function's alloc region:
```llvm
define void @<!-- -->nested_task_with_deps() {
  ....
  %.dep.arr.addr = alloca [1 x %struct.kmp_dep_info], align 8
  %2 = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %.dep.arr.addr, i64 0, i64 0
  %3 = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %2, i32 0, i32 0
  %4 = ptrtoint ptr %omp.private.alloc to i64
  store i64 %4, ptr %3, align 4
  ....
  br label %entry

omp.par.entry:                                    ; preds = %entry
  ....
  %omp.private.alloc = alloca i32, align 4
```

Note the following:
- The private value `x` is alloced where it should be in the parallel op's entry region,
- howerver, since the privae value is also a depedency of the task and since allocation and initialzation of the task depedency info struct both happen in the alloc region,
- this results in the private value being referenced before it is actually defined.

This PR fixes the issue by only allocating the task depedency info in the alloc region while initialzation happens in the current IP of the function with the rest of the logic that depends on it.

---
Full diff: https://github.com/llvm/llvm-project/pull/131795.diff


2 Files Affected:

- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+2-1) 
- (modified) mlir/test/Target/LLVMIR/openmp-llvm.mlir (+58-9) 


``````````diff
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8dcebcdb8791d..c1a533aea529f 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2056,6 +2056,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
       Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
       DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
 
+      Builder.restoreIP(OldIP);
+
       unsigned P = 0;
       for (const DependData &Dep : Dependencies) {
         Value *Base =
@@ -2085,7 +2087,6 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
         ++P;
       }
 
-      Builder.restoreIP(OldIP);
     }
 
     // In the presence of the `if` clause, the following IR is generated:
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index e4114d491fc85..d7f4d0a65b24c 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -2651,6 +2651,16 @@ llvm.func @omp_task_attrs() -> () attributes {
 // CHECK-LABEL: define void @omp_task_with_deps
 // CHECK-SAME: (ptr %[[zaddr:.+]])
 // CHECK:  %[[dep_arr_addr:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR1:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR2:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR3:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+// CHECK:  %[[DEP_ARR_ADDR4:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
+
+// CHECK: %[[omp_global_thread_num:.+]] = call i32 @__kmpc_global_thread_num({{.+}})
+// CHECK: %[[task_data:.+]] = call ptr @__kmpc_omp_task_alloc
+// CHECK-SAME: (ptr @{{.+}}, i32 %[[omp_global_thread_num]], i32 1, i64 40,
+// CHECK-SAME:  i64 0, ptr @[[outlined_fn:.+]])
+
 // CHECK:  %[[dep_arr_addr_0:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[dep_arr_addr]], i64 0, i64 0
 // CHECK:  %[[dep_arr_addr_0_val:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[dep_arr_addr_0]], i32 0, i32 0
 // CHECK:  %[[dep_arr_addr_0_val_int:.+]] = ptrtoint ptr %0 to i64
@@ -2659,40 +2669,33 @@ llvm.func @omp_task_attrs() -> () attributes {
 // CHECK:  store i64 8, ptr %[[dep_arr_addr_0_size]], align 4
 // CHECK:  %[[dep_arr_addr_0_kind:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[dep_arr_addr_0]], i32 0, i32 2
 // CHECK: store i8 1, ptr %[[dep_arr_addr_0_kind]], align 1
+
+// CHECK: call i32 @__kmpc_omp_task_with_deps(ptr @{{.+}}, i32 %[[omp_global_thread_num]], ptr %[[task_data]], {{.*}})
 // -----
 // dependence_type: Out
-// CHECK:  %[[DEP_ARR_ADDR1:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_1:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR1]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_1:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_1]], i32 0, i32 2
 // CHECK:  store i8 3, ptr %[[DEP_TYPE_1]], align 1
 // -----
 // dependence_type: Inout
-// CHECK:  %[[DEP_ARR_ADDR2:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_2:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR2]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_2:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_2]], i32 0, i32 2
 // CHECK:  store i8 3, ptr %[[DEP_TYPE_2]], align 1
 // -----
 // dependence_type: Mutexinoutset
-// CHECK:  %[[DEP_ARR_ADDR3:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_3:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR3]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_3:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_3]], i32 0, i32 2
 // CHECK:  store i8 4, ptr %[[DEP_TYPE_3]], align 1
 // -----
 // dependence_type: Inoutset
-// CHECK:  %[[DEP_ARR_ADDR4:.+]] = alloca [1 x %struct.kmp_dep_info], align 8
 // CHECK:  %[[DEP_ARR_ADDR_4:.+]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_ADDR4]], i64 0, i64 0
 //         [...]
 // CHECK:  %[[DEP_TYPE_4:.+]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_ARR_ADDR_4]], i32 0, i32 2
 // CHECK:  store i8 8, ptr %[[DEP_TYPE_4]], align 1
 llvm.func @omp_task_with_deps(%zaddr: !llvm.ptr) {
-  // CHECK: %[[omp_global_thread_num:.+]] = call i32 @__kmpc_global_thread_num({{.+}})
-  // CHECK: %[[task_data:.+]] = call ptr @__kmpc_omp_task_alloc
-  // CHECK-SAME: (ptr @{{.+}}, i32 %[[omp_global_thread_num]], i32 1, i64 40,
-  // CHECK-SAME:  i64 0, ptr @[[outlined_fn:.+]])
-  // CHECK: call i32 @__kmpc_omp_task_with_deps(ptr @{{.+}}, i32 %[[omp_global_thread_num]], ptr %[[task_data]], {{.*}})
   omp.task depend(taskdependin -> %zaddr : !llvm.ptr) {
     %n = llvm.mlir.constant(1 : i64) : i64
     %valaddr = llvm.alloca %n x i32 : (i64) -> !llvm.ptr
@@ -3373,3 +3376,49 @@ llvm.func @distribute_wsloop(%lb : i32, %ub : i32, %step : i32) {
 // CHECK:         %[[TID:.*]] = call i32 @__kmpc_global_thread_num({{.*}})
 // CHECK:         %[[DIST_UB:.*]] = alloca i32
 // CHECK:         call void @__kmpc_dist_for_static_init_{{.*}}(ptr @{{.*}}, i32 %[[TID]], i32 34, ptr %[[LASTITER]], ptr %[[LB]], ptr %[[UB]], ptr %[[DIST_UB]], ptr %[[STRIDE]], i32 1, i32 0)
+
+// -----
+
+omp.private {type = private} @_QFEx_private_i32 : i32
+llvm.func @nested_task_with_deps() {
+  %0 = llvm.mlir.constant(1 : i64) : i64
+  %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr
+  %2 = llvm.mlir.constant(1 : i64) : i64
+  omp.parallel private(@_QFEx_private_i32 %1 -> %arg0 : !llvm.ptr) {
+    omp.task depend(taskdependout -> %arg0 : !llvm.ptr) {
+      omp.terminator
+    }
+    omp.terminator
+  }
+  llvm.return
+}
+
+// CHECK-LABEL: define void @nested_task_with_deps() {
+// CHECK:         %[[PAR_FORK_ARG:.*]] = alloca { ptr }, align 8
+// CHECK:         %[[DEP_ARR:.*]] = alloca [1 x %struct.kmp_dep_info], align 8
+
+// CHECK:       omp_parallel:
+// CHECK-NEXT:    %[[DEP_ARR_GEP:.*]] = getelementptr { ptr }, ptr %[[PAR_FORK_ARG]], i32 0, i32 0
+// CHECK-NEXT:    store ptr %[[DEP_ARR]], ptr %[[DEP_ARR_GEP]], align 8
+// CHECK-NEXT:    call void {{.*}} @__kmpc_fork_call(ptr @{{.*}}, i32 1, ptr @[[PAR_OUTLINED:.*]], ptr %[[PAR_FORK_ARG]])
+// CHECK-NEXT:    br label %[[PAR_EXIT:.*]]
+
+// CHECK:       [[PAR_EXIT]]:
+// CHECK-NEXT:    ret void
+// CHECK:       }
+
+// CHECK:       define internal void @[[PAR_OUTLINED]]{{.*}} {
+// CHECK:       omp.par.entry:
+// CHECK:         %[[DEP_ARR_GEP_2:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0
+// CHECK:         %[[DEP_ARR_2:.*]] = load ptr, ptr %[[DEP_ARR_GEP_2]], align 8
+// CHECK:         %[[PRIV_ALLOC:omp.private.alloc]] = alloca i32, align 4
+
+// CHECK:         %[[TASK:.*]] = call ptr @__kmpc_omp_task_alloc
+// CHECK:         %[[DEP_STRUCT_GEP:.*]] = getelementptr inbounds [1 x %struct.kmp_dep_info], ptr %[[DEP_ARR_2]], i64 0, i64 0
+// CHECK:         %[[DEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_dep_info, ptr %[[DEP_STRUCT_GEP]], i32 0, i32 0
+// CHECK:         %[[PRIV_ALLOC_TO_INT:.*]] = ptrtoint ptr %[[PRIV_ALLOC]] to i64
+// CHECK:         store i64 %[[PRIV_ALLOC_TO_INT]], ptr %[[DEP_GEP]], align 4
+// CHECK:         call i32 @__kmpc_omp_task_with_deps(ptr @{{.*}}, i32 %{{.*}}, ptr %{{.*}}, i32 1, ptr %[[DEP_ARR_2]], i32 0, ptr null)
+
+// CHECK:         ret void
+// CHECK:       }

``````````

</details>


https://github.com/llvm/llvm-project/pull/131795


More information about the Mlir-commits mailing list