[flang-commits] [flang] [Flang][OpenMP] Create MLIR optimization pass to push index allocations into loop body and remove them if redundant (PR #67010)

Dominik Adamski via flang-commits flang-commits at lists.llvm.org
Fri Sep 22 04:50:18 PDT 2023


DominikAdamski wrote:

Hi,
We would like to introduce new loop worksharing functions for offloaded code. The main idea is as follows:

Input code:
```
 !$omp target parallel do
   do i = 1, 1024
      e(i) = i
   end do
```
Corresponding MLIR code:
```
omp.target   map((tofrom -> %arg0 : !llvm.ptr<array<1024 x i32>>)) {
   omp.parallel   {
   **%i = llvm.alloca (...)** ;we would like to move this alloca to the loop body
         omp.wsloop   for  (%arg1) : i32 = (%2) to (%1) inclusive step (%2) {
           llvm.store %arg1, %4 {tbaa = [#tbaa_tag]} : !llvm.ptr<i32>
          %5 = llvm.load %i {tbaa = [#tbaa_tag]} : !llvm.ptr<i32>
          %6 = llvm.sext %5 : i32 to i64
          %7 = llvm.sub %6, %0  : i64
          %8 = llvm.getelementptr %arg0[0, %7] : (!llvm.ptr<array<1024 x i32>>, i64) -> !llvm.ptr<i32>
          llvm.store %5, %8 {tbaa = [#tbaa_tag]} : !llvm.ptr<i32>
          omp.yield
        }
        omp.terminator
      }
      omp.terminator
    }

```
Desired LLVM IR:
```
gpu_kernel target_kernel (ptr %arg_e) {
 ; kmpc_target_init
 call __kmpc_parallel_51 (parallel_function, arg_e);
 ; kmpc_target_deinit
}

; function which corresponds to the MLIR parallel region
void parallel_function (ptr %tid.addr, ptr %zero.addr, ptr %arg_e) {
    call __kmpc_for_static_loop_4(loop_body_function, num_iters, arg_e, ...)
}

;function which corresponds to the wsloop body:
void loop_body_function(int32 %cnt , struct {ptr %arg_e}) {
    ;LLVM-IR code which corresponds to **e[i] = i**
    ; **i** from source depends on cnt which is set by the function __kmpc_for_static_loop_4
    ; we do not need to pass variable **i** as part of the second argument;
    ; the variable cnt, which is handled by the OpenMP runtime, is sufficient to execute the loop body
}
```
Without Sergio's patch we get the following LLVM IR:

```
gpu_kernel target_kernel (ptr %arg_e) {
 ; kmpc_target_init
 call __kmpc_parallel_51 (parallel_function, arg_e);
 ; kmpc_target_deinit
}

; function which corresponds to the MLIR parallel region
void parallel_function (ptr %tid.addr, ptr %zero.addr, ptr %arg_e) {
    **%additional_i** = alloca i32;
    call __kmpc_for_static_loop_4(loop_body_function, num_iters, **struct{arg_e, additional_i}** ...)
}

;function which corresponds to the wsloop body:
void loop_body_function(int32 %cnt , **ptr struct{ ptr %arg_e, additional_i}** ) {
    ;LLVM-IR code which corresponds to **e[i] = i**
    ; **store new value inside additional_i**
    ; **i** from source code depends on cnt which is set by the function __kmpc_for_static_loop_4
}
```
Link to similar OpenMP DeviceRTL functions: https://github.com/jdoerfert/llvm-project/blob/IPDPS22/openmp/libomptarget/DeviceRTL/src/Workshare.cpp

https://github.com/llvm/llvm-project/pull/67010


More information about the flang-commits mailing list