[flang-commits] [flang] [flang][OpenMP] Map `teams loop` to `teams distribute` when required. (PR #127489)

Thu Feb 20 07:18:46 PST 2025

https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/127489

>From e196418642e4cbe41693d920580be65db85f6b57 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 17 Feb 2025 04:10:59 -0600
Subject: [PATCH] [flang][OpenMP] Map `teams loop` to `teams distribute` when
 required.

This extends support for generic `loop` rewriting by:
1. Preventing nesting multiple worksharing loops inside each other. This
   is checked by walking the `teams loop` region searching for any
   `loop` directive whose `bind` modifier is `parallel`.
2. Preventing convert to worksharing loop if calls to unknow functions
   are found in the `loop` directive's body.

We walk the `teams loop` body to identify either of the above 2
conditions, if either of them is found to be true, we map the `loop`
directive to `distribute`.
---
 .../OpenMP/GenericLoopConversion.cpp          | 65 +++++++++++++-
 flang/test/Lower/OpenMP/loop-directive.f90    | 86 ++++++++++++++++++-
 2 files changed, 146 insertions(+), 5 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
index d2581e3ad0a0a..85e57bff8c420 100644
--- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
@@ -56,7 +56,10 @@ class GenericLoopConversionPattern
           "not yet implemented: Combined `parallel loop` directive");
       break;
     case GenericLoopCombinedInfo::TeamsLoop:
-      rewriteToDistributeParallelDo(loopOp, rewriter);
+      if (teamsLoopCanBeParallelFor(loopOp))
+        rewriteToDistributeParallelDo(loopOp, rewriter);
+      else
+        rewriteToDistrbute(loopOp, rewriter);
       break;
     }
 
@@ -97,8 +100,6 @@ class GenericLoopConversionPattern
     if (!loopOp.getReductionVars().empty())
       return todo("reduction");
 
-    // TODO For `teams loop`, check similar constrains to what is checked
-    // by `TeamsLoopChecker` in SemaOpenMP.cpp.
     return mlir::success();
   }
 
@@ -118,6 +119,64 @@ class GenericLoopConversionPattern
     return result;
   }
 
+  /// Checks whether a `teams loop` construct can be rewriten to `teams
+  /// distribute parallel do` or it has to be converted to `teams distribute`.
+  ///
+  /// This checks similar constrains to what is checked by `TeamsLoopChecker` in
+  /// SemaOpenMP.cpp in clang.
+  static bool teamsLoopCanBeParallelFor(mlir::omp::LoopOp loopOp) {
+    bool canBeParallelFor =
+        !loopOp
+             .walk<mlir::WalkOrder::PreOrder>([&](mlir::Operation *nestedOp) {
+               if (nestedOp == loopOp)
+                 return mlir::WalkResult::advance();
+
+               if (auto nestedLoopOp =
+                       mlir::dyn_cast<mlir::omp::LoopOp>(nestedOp)) {
+                 GenericLoopCombinedInfo combinedInfo =
+                     findGenericLoopCombineInfo(nestedLoopOp);
+
+                 // Worksharing loops cannot be nested inside each other.
+                 // Therefore, if the current `loop` directive nests another
+                 // `loop` whose `bind` modifier is `parallel`, this `loop`
+                 // directive cannot be mapped to `distribute parallel for`
+                 // but rather only to `distribute`.
+                 if (combinedInfo == GenericLoopCombinedInfo::Standalone &&
+                     nestedLoopOp.getBindKind() &&
+                     *nestedLoopOp.getBindKind() ==
+                         mlir::omp::ClauseBindKind::Parallel)
+                   return mlir::WalkResult::interrupt();
+
+                 // TODO check for combined `parallel loop` when we support
+                 // it.
+               }
+
+               if (auto callOp =
+                       mlir::dyn_cast<mlir::CallOpInterface>(nestedOp)) {
+                 // Calls to non-OpenMP API runtime functions inhibits
+                 // transformation to `teams distribute parallel do` since the
+                 // called functions might have nested parallelism themselves.
+                 bool isOpenMPAPI = false;
+                 mlir::CallInterfaceCallable callable =
+                     callOp.getCallableForCallee();
+
+                 if (auto callableSymRef =
+                         mlir::dyn_cast<mlir::SymbolRefAttr>(callable))
+                   isOpenMPAPI =
+                       callableSymRef.getRootReference().strref().starts_with(
+                           "omp_");
+
+                 if (!isOpenMPAPI)
+                   return mlir::WalkResult::interrupt();
+               }
+
+               return mlir::WalkResult::advance();
+             })
+             .wasInterrupted();
+
+    return canBeParallelFor;
+  }
+
   void rewriteStandaloneLoop(mlir::omp::LoopOp loopOp,
                              mlir::ConversionPatternRewriter &rewriter) const {
     using namespace mlir::omp;
diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90
index 785f732e1b4f5..c98dd2b2b7eed 100644
--- a/flang/test/Lower/OpenMP/loop-directive.f90
+++ b/flang/test/Lower/OpenMP/loop-directive.f90
@@ -1,7 +1,8 @@
 ! This test checks lowering of OpenMP loop Directive.
 
-! RUN: bbc -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
-! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
+! REQUIRES: openmp_runtime
+
+! RUN: %flang_fc1 -emit-hlfir %openmp_flags -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s
 
 ! CHECK: omp.declare_reduction @[[RED:add_reduction_i32]] : i32
 ! CHECK: omp.private {type = private} @[[DUMMY_PRIV:.*test_privateEdummy_private.*]] : i32
@@ -179,3 +180,84 @@ subroutine test_standalone_bind_parallel
     c(i) = a(i) * b(i)
   end do
 end subroutine
+
+! CHECK-LABEL: func.func @_QPteams_loop_cannot_be_parallel_for
+subroutine teams_loop_cannot_be_parallel_for
+  implicit none
+  integer :: iter, iter2, val(20)
+  val = 0
+  ! CHECK: omp.teams {
+
+  ! Verify the outer `loop` directive was mapped to only `distribute`.
+  ! CHECK-NOT: omp.parallel {{.*}}
+  ! CHECK:     omp.distribute {{.*}} {
+  ! CHECK-NEXT:  omp.loop_nest {{.*}} {
+
+  ! Verify the inner `loop` directive was mapped to a worksharing loop.
+  ! CHECK:         omp.wsloop {{.*}} {
+  ! CHECK-NEXT:      omp.loop_nest {{.*}} {
+  ! CHECK:           }
+  ! CHECK:         }
+
+  ! CHECK:       }
+  ! CHECK:     }
+
+  ! CHECK: }
+  !$omp target teams loop map(tofrom:val)
+  DO iter = 1, 5
+    !$omp loop bind(parallel)
+    DO iter2 = 1, 5
+      val(iter+iter2) = iter+iter2
+    END DO
+  END DO
+end subroutine
+
+subroutine foo()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPteams_loop_cannot_be_parallel_for_2
+subroutine teams_loop_cannot_be_parallel_for_2
+  implicit none
+  integer :: iter, val(20)
+  val = 0
+
+  ! CHECK: omp.teams {
+
+  ! Verify the `loop` directive was mapped to only `distribute`.
+  ! CHECK-NOT: omp.parallel {{.*}}
+  ! CHECK:     omp.distribute {{.*}} {
+  ! CHECK-NEXT:  omp.loop_nest {{.*}} {
+  ! CHECK:         fir.call @_QPfoo
+  ! CHECK:       }
+  ! CHECK:     }
+
+  ! CHECK: }
+  !$omp target teams loop map(tofrom:val)
+  DO iter = 1, 5
+    call foo()
+  END DO
+end subroutine
+
+! CHECK-LABEL: func.func @_QPteams_loop_cannot_be_parallel_for_3
+subroutine teams_loop_cannot_be_parallel_for_3
+  use omp_lib
+  implicit none
+  integer :: iter, tid, val(20)
+  val = 0
+
+  !CHECK: omp.teams {
+  !CHECK:   omp.parallel {{.*}} {
+  !CHECK:     omp.distribute {
+  !CHECK:       omp.wsloop {
+  !CHECK:         omp.loop_nest {{.*}} {
+  !CHECK:           fir.call @omp_get_thread_num()
+  !CHECK:         }
+  !CHECK:       }
+  !CHECK:     }
+  !CHECK:   }
+  !CHECK: }
+  !$omp target teams loop map(tofrom:val)
+  DO iter = 1, 5
+    tid = omp_get_thread_num()
+  END DO
+end subroutine