[flang-commits] [flang] [Flang][OpenMP] Create MLIR optimization pass to push index allocations into loop body and remove them if redundant (PR #67010)
Dominik Adamski via flang-commits
flang-commits at lists.llvm.org
Fri Sep 22 04:50:18 PDT 2023
DominikAdamski wrote:
Hi,
We would like to introduce new loop worksharing functions for offloaded code. The main idea is as follows:
Input code:
```
!$omp target parallel do
do i = 1, 1024
e(i) = i
end do
```
Corresponding MLIR code:
```
omp.target map((tofrom -> %arg0 : !llvm.ptr<array<1024 x i32>>)) {
omp.parallel {
**%i = llvm.alloca (...)** ;we would like to move this alloca to the loop body
omp.wsloop for (%arg1) : i32 = (%2) to (%1) inclusive step (%2) {
llvm.store %arg1, %i {tbaa = [#tbaa_tag]} : !llvm.ptr<i32>
%5 = llvm.load %i {tbaa = [#tbaa_tag]} : !llvm.ptr<i32>
%6 = llvm.sext %5 : i32 to i64
%7 = llvm.sub %6, %0 : i64
%8 = llvm.getelementptr %arg0[0, %7] : (!llvm.ptr<array<1024 x i32>>, i64) -> !llvm.ptr<i32>
llvm.store %5, %8 {tbaa = [#tbaa_tag]} : !llvm.ptr<i32>
omp.yield
}
omp.terminator
}
omp.terminator
}
```
Desired LLVM IR:
```
gpu_kernel target_kernel (ptr %arg_e) {
; kmpc_target_init
call __kmpc_parallel_51 (parallel_function, arg_e);
; kmpc_target_deinit
}
; function which corresponds to the MLIR parallel region
void parallel_function (ptr %tid.addr, ptr %zero.addr, ptr %arg_e) {
call __kmpc_for_static_loop_4(loop_body_function, num_iters, arg_e, ...)
}
;function which corresponds to the wsloop body:
void loop_body_function(int32 %cnt , struct {ptr %arg_e}) {
;LLVM-IR code which corresponds to **e[i] = i**
; **i** from the source code depends on cnt, which is set by the function __kmpc_for_static_loop_4
; we do not need to pass the variable **i** as part of the second argument
; the variable cnt, which is handled by the OpenMP runtime, is enough to execute the loop body code
}
```
Without Sergio's patch we get the following LLVM IR:
```
gpu_kernel target_kernel (ptr %arg_e) {
; kmpc_target_init
call __kmpc_parallel_51 (parallel_function, arg_e);
; kmpc_target_deinit
}
; function which corresponds to the MLIR parallel region
void parallel_function (ptr %tid.addr, ptr %zero.addr, ptr %arg_e) {
**%additional_i** = alloca i32;
call __kmpc_for_static_loop_4(loop_body_function, num_iters, **struct{arg_e, additional_i}** ...)
}
;function which corresponds to the wsloop body:
void loop_body_function(int32 %cnt , **ptr struct{ ptr %arg_e, additional_i}** ) {
;LLVM-IR code which corresponds to **e[i] = i**
; **store new value inside additional_i**
; **i** from source code depends on cnt which is set by the function __kmpc_for_static_loop_4
}
```
Link to similar OpenMP DeviceRTL functions: https://github.com/jdoerfert/llvm-project/blob/IPDPS22/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
https://github.com/llvm/llvm-project/pull/67010
More information about the flang-commits
mailing list