[flang-commits] [flang] [flang][OpenMP] Support `bind` clause code-gen for standalone `loop`s (PR #122674)

Mon Jan 13 00:14:51 PST 2025

https://github.com/ergawy created https://github.com/llvm/llvm-project/pull/122674

Extends rewriting of `loop` directives by supporting `bind` clause for
standalone directives. This follows both the spec and the current state
of clang as follows:
* No `bind` or `bind(thread)`: the `loop` is rewritten to `simd`.
* `bind(parallel)`: the `loop` is rewritten to `do`.
* `bind(teams)`: the `loop` is rewritten to `distribute`.

This is a follow-up PR for https://github.com/llvm/llvm-project/pull/122632, only the latest commit in this PR is relevant to the PR.

>From 52076c753e5120a8318061e32d22a51cc602c380 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Wed, 18 Dec 2024 02:39:40 -0600
Subject: [PATCH 1/2] [flang][OpenMP] Rewrite standalone `loop` directives to
 `simd`

Extends conversion support for `loop` directives. This PR handles
standalone `loop` constructs by rewriting them to equivalent `simd`
constructs. The reasoning behind that decision is documented in the
rewrite function itself.
---
 .../OpenMP/GenericLoopConversion.cpp          | 85 +++++++++++++++++--
 flang/test/Lower/OpenMP/loop-directive.f90    | 45 +++++++++-
 .../generic-loop-rewriting-todo.mlir          | 13 ---
 3 files changed, 121 insertions(+), 22 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
index c3c1f3b2848b82..b518fa750e6d69 100644
--- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
@@ -30,19 +30,37 @@ class GenericLoopConversionPattern
     : public mlir::OpConversionPattern<mlir::omp::LoopOp> {
 public:
   enum class GenericLoopCombinedInfo {
-    None,
+    Standalone,
     TargetTeamsLoop,
     TargetParallelLoop
   };
 
   using mlir::OpConversionPattern<mlir::omp::LoopOp>::OpConversionPattern;
 
+  explicit GenericLoopConversionPattern(mlir::MLIRContext *ctx)
+      : mlir::OpConversionPattern<mlir::omp::LoopOp>{ctx} {
+    this->setHasBoundedRewriteRecursion(true);
+  }
+
   mlir::LogicalResult
   matchAndRewrite(mlir::omp::LoopOp loopOp, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
     assert(mlir::succeeded(checkLoopConversionSupportStatus(loopOp)));
 
-    rewriteToDistributeParallelDo(loopOp, rewriter);
+    GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp);
+
+    switch (combinedInfo) {
+    case GenericLoopCombinedInfo::Standalone:
+      rewriteToSimdLoop(loopOp, rewriter);
+      break;
+    case GenericLoopCombinedInfo::TargetParallelLoop:
+      assert(false);
+      break;
+    case GenericLoopCombinedInfo::TargetTeamsLoop:
+      rewriteToDistributeParallelDo(loopOp, rewriter);
+      break;
+    }
+
     rewriter.eraseOp(loopOp);
     return mlir::success();
   }
@@ -52,9 +70,8 @@ class GenericLoopConversionPattern
     GenericLoopCombinedInfo combinedInfo = findGenericLoopCombineInfo(loopOp);
 
     switch (combinedInfo) {
-    case GenericLoopCombinedInfo::None:
-      return loopOp.emitError(
-          "not yet implemented: Standalone `omp loop` directive");
+    case GenericLoopCombinedInfo::Standalone:
+      break;
     case GenericLoopCombinedInfo::TargetParallelLoop:
       return loopOp.emitError(
           "not yet implemented: Combined `omp target parallel loop` directive");
@@ -86,7 +103,7 @@ class GenericLoopConversionPattern
   static GenericLoopCombinedInfo
   findGenericLoopCombineInfo(mlir::omp::LoopOp loopOp) {
     mlir::Operation *parentOp = loopOp->getParentOp();
-    GenericLoopCombinedInfo result = GenericLoopCombinedInfo::None;
+    GenericLoopCombinedInfo result = GenericLoopCombinedInfo::Standalone;
 
     if (auto teamsOp = mlir::dyn_cast_if_present<mlir::omp::TeamsOp>(parentOp))
       if (mlir::isa_and_present<mlir::omp::TargetOp>(teamsOp->getParentOp()))
@@ -100,6 +117,62 @@ class GenericLoopConversionPattern
     return result;
   }
 
+  /// Rewrites standalone `loop` directives to equivalent `simd` constructs.
+  /// The reasoning behind this decision is that according to the spec (version
+  /// 5.2, section 11.7.1):
+  ///
+  /// "If the bind clause is not specified on a construct for which it may be
+  /// specified and the construct is closely nested inside a teams or parallel
+  /// construct, the effect is as if binding is teams or parallel. If none of
+  /// those conditions hold, the binding region is not defined."
+  ///
+  /// which means that standalone `loop` directives have undefined binding
+  /// region. Moreover, the spec says (in the next paragraph):
+  ///
+  /// "The specified binding region determines the binding thread set.
+  /// Specifically, if the binding region is a teams region, then the binding
+  /// thread set is the set of initial threads that are executing that region
+  /// while if the binding region is a parallel region, then the binding thread
+  /// set is the team of threads that are executing that region. If the binding
+  /// region is not defined, then the binding thread set is the encountering
+  /// thread."
+  ///
+  /// which means that the binding thread set for a standalone `loop` directive
+  /// is only the encountering thread.
+  ///
+  /// Since the encountering thread is the binding thread (set) for a
+  /// standalone `loop` directive, the best we can do in such case is to "simd"
+  /// the directive.
+  void rewriteToSimdLoop(mlir::omp::LoopOp loopOp,
+                         mlir::ConversionPatternRewriter &rewriter) const {
+    loopOp.emitWarning("Detected standalone OpenMP `loop` directive, the "
+                       "associated loop will be rewritten to `simd`.");
+    mlir::omp::SimdOperands simdClauseOps;
+    simdClauseOps.privateVars = loopOp.getPrivateVars();
+
+    auto privateSyms = loopOp.getPrivateSyms();
+    if (privateSyms)
+      simdClauseOps.privateSyms.assign(privateSyms->begin(),
+                                       privateSyms->end());
+
+    Fortran::common::openmp::EntryBlockArgs simdArgs;
+    simdArgs.priv.vars = simdClauseOps.privateVars;
+
+    auto simdOp =
+        rewriter.create<mlir::omp::SimdOp>(loopOp.getLoc(), simdClauseOps);
+    mlir::Block *simdBlock =
+        genEntryBlock(rewriter, simdArgs, simdOp.getRegion());
+
+    mlir::IRMapping mapper;
+    mlir::Block &loopBlock = *loopOp.getRegion().begin();
+
+    for (auto [loopOpArg, simdopArg] :
+         llvm::zip_equal(loopBlock.getArguments(), simdBlock->getArguments()))
+      mapper.map(loopOpArg, simdopArg);
+
+    rewriter.clone(*loopOp.begin(), mapper);
+  }
+
   void rewriteToDistributeParallelDo(
       mlir::omp::LoopOp loopOp,
       mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90
index 4b4d640e449eeb..9fa0de3bfe171a 100644
--- a/flang/test/Lower/OpenMP/loop-directive.f90
+++ b/flang/test/Lower/OpenMP/loop-directive.f90
@@ -11,7 +11,7 @@
 subroutine test_no_clauses()
   integer :: i, j, dummy = 1
 
-  ! CHECK: omp.loop private(@[[I_PRIV]] %{{.*}}#0 -> %[[ARG:.*]] : !fir.ref<i32>) {
+  ! CHECK: omp.simd private(@[[I_PRIV]] %{{.*}}#0 -> %[[ARG:.*]] : !fir.ref<i32>) {
   ! CHECK-NEXT:   omp.loop_nest (%[[IV:.*]]) : i32 = (%{{.*}}) to (%{{.*}}) {{.*}} {
   ! CHECK:          %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG]]
   ! CHECK:          fir.store %[[IV]] to %[[ARG_DECL]]#1 : !fir.ref<i32>
@@ -27,7 +27,7 @@ subroutine test_no_clauses()
 ! CHECK-LABEL: func.func @_QPtest_collapse
 subroutine test_collapse()
   integer :: i, j, dummy = 1
-  ! CHECK: omp.loop private(@{{.*}} %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
+  ! CHECK: omp.simd private(@{{.*}} %{{.*}}#0 -> %{{.*}}, @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
   ! CHECK-NEXT:   omp.loop_nest (%{{.*}}, %{{.*}}) : i32 {{.*}} {
   ! CHECK:        }
   ! CHECK: }
@@ -43,7 +43,7 @@ subroutine test_collapse()
 ! CHECK-LABEL: func.func @_QPtest_private
 subroutine test_private()
   integer :: i, dummy = 1
-  ! CHECK: omp.loop private(@[[DUMMY_PRIV]] %{{.*}}#0 -> %[[DUMMY_ARG:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
+  ! CHECK: omp.simd private(@[[DUMMY_PRIV]] %{{.*}}#0 -> %[[DUMMY_ARG:.*]], @{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
   ! CHECK-NEXT:   omp.loop_nest (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) {{.*}} {
   ! CHECK:          %[[DUMMY_DECL:.*]]:2 = hlfir.declare %[[DUMMY_ARG]] {uniq_name = "_QFtest_privateEdummy"}
   ! CHECK:          %{{.*}} = fir.load %[[DUMMY_DECL]]#0
@@ -100,3 +100,42 @@ subroutine test_bind()
   end do
   !$omp end loop
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_nested_directives
+subroutine test_nested_directives
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK: omp.teams {
+
+  ! Verify the first `loop` directive was combined with `target teams` into 
+  ! `target teams distribute parallel do`.
+  ! CHECK:   omp.parallel {{.*}} {
+  ! CHECK:     omp.distribute {
+  ! CHECK:       omp.wsloop {
+  ! CHECK:         omp.loop_nest {{.*}} {
+
+  ! Very the second `loop` directive was rewritten to `simd`.
+  ! CHECK:           omp.simd {{.*}} {
+  ! CHECK:             omp.loop_nest {{.*}} {
+  ! CHECK:             }
+  ! CHECK:           }
+
+  ! CHECK:         }
+  ! CHECK:       } {omp.composite}
+  ! CHECK:     } {omp.composite}
+  ! CHECK:   } {omp.composite}
+  ! CHECK: }
+  !$omp target teams map(to: a,b) map(from: c)
+  !$omp loop
+  do j=1,1000
+    !$omp loop
+    do i=1,N
+      c(i) = a(i) * b(i)
+    end do
+  end do
+  !$omp end target teams
+end subroutine
diff --git a/flang/test/Transforms/generic-loop-rewriting-todo.mlir b/flang/test/Transforms/generic-loop-rewriting-todo.mlir
index 9ea6bf001b6685..becd6b8dcb5cb4 100644
--- a/flang/test/Transforms/generic-loop-rewriting-todo.mlir
+++ b/flang/test/Transforms/generic-loop-rewriting-todo.mlir
@@ -1,18 +1,5 @@
 // RUN: fir-opt --omp-generic-loop-conversion -verify-diagnostics %s
 
-func.func @_QPtarget_loop() {
-  %c0 = arith.constant 0 : i32
-  %c10 = arith.constant 10 : i32
-  %c1 = arith.constant 1 : i32
-  // expected-error at below {{not yet implemented: Standalone `omp loop` directive}}
-  omp.loop {
-    omp.loop_nest (%arg3) : i32 = (%c0) to (%c10) inclusive step (%c1) {
-      omp.yield
-    }
-  }
-  return
-}
-
 func.func @_QPtarget_parallel_loop() {
   omp.target {
     omp.parallel {

>From f153a15bd7dd626098bc68f52d9492dea73d6856 Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Sun, 12 Jan 2025 23:47:09 -0600
Subject: [PATCH 2/2] [flang][OpenMP] Support `bind` clause code-gen for
 standalone `loop`s

Extends rewriting of `loop` directives by supporting `bind` clause for
standalone directives. This follows both the spec and the current state
of clang as follows:

* No `bind` or `bind(thread)`: the `loop` is rewritten to `simd`.
* `bind(parallel)`: the `loop` is rewritten to `do`.
* `bind(teams)`: the `loop` is rewritten to `distribute`.
---
 .../OpenMP/GenericLoopConversion.cpp          | 80 ++++++++++++++-----
 flang/test/Lower/OpenMP/loop-directive.f90    | 42 +++++++++-
 2 files changed, 103 insertions(+), 19 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
index b518fa750e6d69..949d2d37c85114 100644
--- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
@@ -51,7 +51,7 @@ class GenericLoopConversionPattern
 
     switch (combinedInfo) {
     case GenericLoopCombinedInfo::Standalone:
-      rewriteToSimdLoop(loopOp, rewriter);
+      rewriteStandaloneLoop(loopOp, rewriter);
       break;
     case GenericLoopCombinedInfo::TargetParallelLoop:
       assert(false);
@@ -85,7 +85,10 @@ class GenericLoopConversionPattern
              << loopOp->getName() << " operation";
     };
 
-    if (loopOp.getBindKind())
+    // For standalone directives, `bind` is already supported. Other combined
+    // forms will be supported in a follow-up PR.
+    if (combinedInfo != GenericLoopCombinedInfo::Standalone &&
+        loopOp.getBindKind())
       return todo("bind");
 
     if (loopOp.getOrder())
@@ -117,7 +120,27 @@ class GenericLoopConversionPattern
     return result;
   }
 
-  /// Rewrites standalone `loop` directives to equivalent `simd` constructs.
+  void rewriteStandaloneLoop(mlir::omp::LoopOp loopOp,
+                             mlir::ConversionPatternRewriter &rewriter) const {
+    using namespace mlir::omp;
+    std::optional<ClauseBindKind> bindKind = loopOp.getBindKind();
+
+    if (!bindKind.has_value())
+      return rewriteToSimdLoop(loopOp, rewriter);
+
+    switch (*loopOp.getBindKind()) {
+    case ClauseBindKind::Parallel:
+      return rewriteToWsloop(loopOp, rewriter);
+    case ClauseBindKind::Teams:
+      return rewriteToDistrbute(loopOp, rewriter);
+    case ClauseBindKind::Thread:
+      return rewriteToSimdLoop(loopOp, rewriter);
+    }
+  }
+
+  /// Rewrites standalone `loop` (with `bind` clause with `bind(parallel)`)
+  /// directives to equivalent `simd` constructs.
+  ///
   /// The reasoning behind this decision is that according to the spec (version
   /// 5.2, section 11.7.1):
   ///
@@ -145,30 +168,51 @@ class GenericLoopConversionPattern
   /// the directive.
   void rewriteToSimdLoop(mlir::omp::LoopOp loopOp,
                          mlir::ConversionPatternRewriter &rewriter) const {
-    loopOp.emitWarning("Detected standalone OpenMP `loop` directive, the "
-                       "associated loop will be rewritten to `simd`.");
-    mlir::omp::SimdOperands simdClauseOps;
-    simdClauseOps.privateVars = loopOp.getPrivateVars();
+    loopOp.emitWarning(
+        "Detected standalone OpenMP `loop` directive with thread binding, "
+        "the associated loop will be rewritten to `simd`.");
+    rewriteToSingleWrapperOp<mlir::omp::SimdOp, mlir::omp::SimdOperands>(
+        loopOp, rewriter);
+  }
+
+  void rewriteToDistrbute(mlir::omp::LoopOp loopOp,
+                          mlir::ConversionPatternRewriter &rewriter) const {
+    rewriteToSingleWrapperOp<mlir::omp::DistributeOp,
+                             mlir::omp::DistributeOperands>(loopOp, rewriter);
+  }
+
+  void rewriteToWsloop(mlir::omp::LoopOp loopOp,
+                       mlir::ConversionPatternRewriter &rewriter) const {
+    rewriteToSingleWrapperOp<mlir::omp::WsloopOp, mlir::omp::WsloopOperands>(
+        loopOp, rewriter);
+  }
+
+  template <typename OpTy, typename OpOperandsTy>
+  void
+  rewriteToSingleWrapperOp(mlir::omp::LoopOp loopOp,
+                           mlir::ConversionPatternRewriter &rewriter) const {
+    OpOperandsTy distributeClauseOps;
+    distributeClauseOps.privateVars = loopOp.getPrivateVars();
 
     auto privateSyms = loopOp.getPrivateSyms();
     if (privateSyms)
-      simdClauseOps.privateSyms.assign(privateSyms->begin(),
-                                       privateSyms->end());
+      distributeClauseOps.privateSyms.assign(privateSyms->begin(),
+                                             privateSyms->end());
 
-    Fortran::common::openmp::EntryBlockArgs simdArgs;
-    simdArgs.priv.vars = simdClauseOps.privateVars;
+    Fortran::common::openmp::EntryBlockArgs distributeArgs;
+    distributeArgs.priv.vars = distributeClauseOps.privateVars;
 
-    auto simdOp =
-        rewriter.create<mlir::omp::SimdOp>(loopOp.getLoc(), simdClauseOps);
-    mlir::Block *simdBlock =
-        genEntryBlock(rewriter, simdArgs, simdOp.getRegion());
+    auto distributeOp =
+        rewriter.create<OpTy>(loopOp.getLoc(), distributeClauseOps);
+    mlir::Block *distributeBlock =
+        genEntryBlock(rewriter, distributeArgs, distributeOp.getRegion());
 
     mlir::IRMapping mapper;
     mlir::Block &loopBlock = *loopOp.getRegion().begin();
 
-    for (auto [loopOpArg, simdopArg] :
-         llvm::zip_equal(loopBlock.getArguments(), simdBlock->getArguments()))
-      mapper.map(loopOpArg, simdopArg);
+    for (auto [loopOpArg, distributeOpArg] : llvm::zip_equal(
+             loopBlock.getArguments(), distributeBlock->getArguments()))
+      mapper.map(loopOpArg, distributeOpArg);
 
     rewriter.clone(*loopOp.begin(), mapper);
   }
diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90
index 9fa0de3bfe171a..845905da0fcba2 100644
--- a/flang/test/Lower/OpenMP/loop-directive.f90
+++ b/flang/test/Lower/OpenMP/loop-directive.f90
@@ -92,7 +92,7 @@ subroutine test_reduction()
 ! CHECK-LABEL: func.func @_QPtest_bind
 subroutine test_bind()
   integer :: i, dummy = 1
-  ! CHECK: omp.loop bind(thread) private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
+  ! CHECK: omp.simd private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) {
   ! CHECK: }
   !$omp loop bind(thread)
   do i=1,10
@@ -139,3 +139,43 @@ subroutine test_nested_directives
   end do
   !$omp end target teams
 end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_standalone_bind_teams
+subroutine test_standalone_bind_teams
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK:     omp.distribute
+  ! CHECK-SAME:  private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
+  ! CHECK-SAME:          @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+  ! CHECK:       omp.loop_nest {{.*}} {
+  ! CHECK:       }
+  ! CHECK:     }
+  !$omp loop bind(teams) private(a)
+  do i=1,N
+    c(i) = a(i) * b(i)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_standalone_bind_parallel
+subroutine test_standalone_bind_parallel
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK:     omp.wsloop
+  ! CHECK-SAME:  private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
+  ! CHECK-SAME:          @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+  ! CHECK:       omp.loop_nest {{.*}} {
+  ! CHECK:       }
+  ! CHECK:     }
+  !$omp loop bind(parallel) private(a)
+  do i=1,N
+    c(i) = a(i) * b(i)
+  end do
+end subroutine