[Openmp-commits] [openmp] [mlir] [llvm] [OpenMP][OMPIRBuilder] Fix LLVM IR codegen for collapsed device loop (PR #78708)

Fri Jan 19 04:45:02 PST 2024

https://github.com/DominikAdamski created https://github.com/llvm/llvm-project/pull/78708

When we generate the loop body function, we need to be sure, that all original loop counters are replaced by the new counter.

We need to save all items which use the original loop counter and then perform replacement of the original loop counter. If we don't do it, there is a risk that some values are not updated.

>From e0562229e5628b437aa7af4e5eace5f844d5c2b4 Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski at amd.com>
Date: Fri, 19 Jan 2024 06:20:06 -0600
Subject: [PATCH] [OpenMP][OMPIRBuilder] Fix LLVM IR codegen for collapsed
 device loop

When we generate the loop body function, we need to be sure,
that all original loop counters are replaced by the new counter.

We need to save all items which use the original loop counter
and then perform replacement of the original loop counter. If we don't
do it, there is a risk that some values are not updated.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  7 +--
 .../LLVMIR/omptarget-wsloop-collapsed.mlir    | 39 ++++++++++++++++
 .../fortran/target-parallel-do-collapse.f90   | 44 +++++++++++++++++++
 3 files changed, 87 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
 create mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f6cf358119fb715..211281452de22d8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2876,9 +2876,10 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   // We need to model loop body region as the function f(cnt, loop_arg).
   // That's why we replace loop induction variable by the new counter
   // which will be one of loop body function argument
-  for (auto Use = CLI->getIndVar()->user_begin();
-       Use != CLI->getIndVar()->user_end(); ++Use) {
-    if (Instruction *Inst = dyn_cast<Instruction>(*Use)) {
+  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
+                            CLI->getIndVar()->user_end());
+  for (auto Use : Users) {
+    if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
       if (ParallelRegionBlockSet.count(Inst->getParent())) {
         Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
       }
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
new file mode 100644
index 000000000000000..e246c551886cfa0
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -0,0 +1,39 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// The aim of the test is to check the GPU LLVM IR codegen
+// for nested omp do loop with collapse clause inside omp target region
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
+  llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
+    %loop_ub = llvm.mlir.constant(99 : i32) : i32
+    %loop_lb = llvm.mlir.constant(0 : i32) : i32
+    %loop_step = llvm.mlir.constant(1 : index) : i32
+    omp.wsloop for  (%arg1, %arg2) : i32 = (%loop_lb, %loop_lb) to (%loop_ub, %loop_ub) inclusive step (%loop_step, %loop_step) {
+      %1 = llvm.add %arg1, %arg2  : i32
+      %2 = llvm.mul %arg2, %loop_ub overflow<nsw>  : i32
+      %3 = llvm.add %arg1, %2 :i32
+      %4 = llvm.getelementptr %arg0[%3] : (!llvm.ptr, i32) -> !llvm.ptr, i32
+      llvm.store %1, %4 : i32, !llvm.ptr
+      omp.yield
+    }
+    llvm.return
+  }
+}
+
+// CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
+// CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
+// CHECK-SAME: ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
+// CHECK-SAME: i32 %[[NUM_THREADS:.*]], i32 0)
+
+// CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
+// CHECK:   %[[TMP0:.*]] = urem i32 %[[LOOP_CNT]], 100
+// CHECK:   %[[TMP1:.*]] = udiv i32 %[[LOOP_CNT]], 100
+// CHECK:   %[[TMP2:.*]] = mul i32 %[[TMP1]], 1
+// CHECK:   %[[TMP3:.*]] = add i32 %[[TMP2]], 0
+// CHECK:   %[[TMP4:.*]] = mul i32 %[[TMP0]], 1
+// CHECK:   %[[TMP5:.*]] = add i32 %[[TMP4]], 0
+// CHECK:   %[[TMP6:.*]] = add i32 %[[TMP3]], %[[TMP5]]
+// CHECK:   %[[TMP7:.*]] = mul nsw i32 %[[TMP5]], 99
+// CHECK:   %[[TMP8:.*]] = add i32 %[[TMP3]], %[[TMP7]]
+// CHECK:   %[[TMP9:.*]] = getelementptr i32, ptr %[[ARRAY:.*]], i32 %[[TMP8]]
+// CHECK:   store i32 %[[TMP6]], ptr %[[TMP9]], align 4
diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90
new file mode 100644
index 000000000000000..e44d6a50969bbc4
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90
@@ -0,0 +1,44 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+   use omp_lib
+   implicit none
+   integer :: i,j
+   integer :: array(10,10), errors = 0
+   do i = 1, 10
+      do j = 1, 10
+         array(j, i) = 0
+      end do
+   end do
+
+   !$omp target parallel do map(from:array) collapse(2)
+   do i = 1, 10
+      do j = 1, 10
+         array( j, i) = i + j
+      end do
+    end do
+    !$omp end target parallel do
+
+    do i = 1, 10
+       do j = 1, 10
+          if ( array( j, i) .ne. (i + j) ) then
+             errors = errors + 1
+          end if
+       end do
+   end do
+
+   print *,"number of errors: ", errors
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  number of errors: 0
+