[llvm-branch-commits] [flang] [flang][OpenMP] Extend `do concurrent` mapping to multi-range loops (PR #127634)
Kareem Ergawy via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Mar 17 00:54:23 PDT 2025
https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/127634
From 8b56c277f04c4f2d3a8a387d20454f7ddb86754c Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Tue, 18 Feb 2025 06:17:17 -0600
Subject: [PATCH 1/2] [flang][OpenMP] Extend `do concurrent` mapping to
multi-range loops
Adds support for converting multi-range loops to OpenMP (on the host
only for now). The changes here "prepare" a loop nest for collapsing by
sinking iteration variables to the innermost `fir.do_loop` op in the
nest.
---
flang/docs/DoConcurrentConversionToOpenMP.md | 29 ++++
.../OpenMP/DoConcurrentConversion.cpp | 139 +++++++++++++++++-
.../multiple_iteration_ranges.f90 | 72 +++++++++
3 files changed, 239 insertions(+), 1 deletion(-)
create mode 100644 flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
diff --git a/flang/docs/DoConcurrentConversionToOpenMP.md b/flang/docs/DoConcurrentConversionToOpenMP.md
index 19611615ee9d6..ecb4428d7d3ba 100644
--- a/flang/docs/DoConcurrentConversionToOpenMP.md
+++ b/flang/docs/DoConcurrentConversionToOpenMP.md
@@ -173,6 +173,35 @@ omp.parallel {
<!-- TODO -->
+### Multi-range loops
+
+The pass currently supports multi-range loops as well. Given the following
+example:
+
+```fortran
+ do concurrent(i=1:n, j=1:m)
+ a(i,j) = i * j
+ end do
+```
+
+The generated `omp.loop_nest` operation looks like:
+
+```
+omp.loop_nest (%arg0, %arg1)
+ : index = (%17, %19) to (%18, %20)
+ inclusive step (%c1_2, %c1_4) {
+ fir.store %arg0 to %private_i#1 : !fir.ref<i32>
+ fir.store %arg1 to %private_j#1 : !fir.ref<i32>
+ ...
+ omp.yield
+}
+```
+
+It is worth noting that we have privatized versions of both iteration
+variables: `i` and `j`. These are locally allocated inside the parallel/target
+OpenMP region, similar to the single-range example shown in the previous
+section.
+
<!--
More details about current status will be added along with relevant parts of the
implementation in later upstreaming patches.
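
As an aside for readers who think in OpenMP directive terms rather than FIR/MLIR: the mapping documented above corresponds roughly to collapsing the two ranges into a single worksharing loop whose iteration variables are private. The sketch below only illustrates that correspondence and is not something the pass emits (it rewrites FIR directly); the program name and bounds are made up.

```fortran
! Illustrative only: roughly the source-level equivalent of mapping
! `do concurrent(i=1:n, j=1:m)` to a collapsed OpenMP worksharing loop
! with privatized iteration variables.
program collapse_sketch
  implicit none
  integer, parameter :: n = 10, m = 20
  integer :: a(n, m)
  integer :: i, j

  !$omp parallel do collapse(2) private(i, j)
  do i = 1, n
    do j = 1, m
      a(i, j) = i * j
    end do
  end do
  !$omp end parallel do
end program collapse_sketch
```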
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index bb535cf8aba03..ae3fe4db6b9f5 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -30,6 +30,9 @@ namespace looputils {
struct InductionVariableInfo {
/// The operation allocating memory for iteration variable.
mlir::Operation *iterVarMemDef;
+ /// the operation(s) updating the iteration variable with the current
+ /// iteration number.
+ llvm::SetVector<mlir::Operation *> indVarUpdateOps;
};
using LoopNestToIndVarMap =
@@ -87,6 +90,47 @@ mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
return result.getDefiningOp();
}
+/// Collects the op(s) responsible for updating a loop's iteration variable with
+/// the current iteration number. For example, for the input IR:
+/// ```
+/// %i = fir.alloca i32 {bindc_name = "i"}
+/// %i_decl:2 = hlfir.declare %i ...
+/// ...
+/// fir.do_loop %i_iv = %lb to %ub step %step unordered {
+/// %1 = fir.convert %i_iv : (index) -> i32
+/// fir.store %1 to %i_decl#1 : !fir.ref<i32>
+/// ...
+/// }
+/// ```
+/// this function would return the first 2 ops in the `fir.do_loop`'s region.
+llvm::SetVector<mlir::Operation *>
+extractIndVarUpdateOps(fir::DoLoopOp doLoop) {
+ mlir::Value indVar = doLoop.getInductionVar();
+ llvm::SetVector<mlir::Operation *> indVarUpdateOps;
+
+ llvm::SmallVector<mlir::Value> toProcess;
+ toProcess.push_back(indVar);
+
+ llvm::DenseSet<mlir::Value> done;
+
+ while (!toProcess.empty()) {
+ mlir::Value val = toProcess.back();
+ toProcess.pop_back();
+
+ if (!done.insert(val).second)
+ continue;
+
+ for (mlir::Operation *user : val.getUsers()) {
+ indVarUpdateOps.insert(user);
+
+ for (mlir::Value result : user->getResults())
+ toProcess.push_back(result);
+ }
+ }
+
+ return std::move(indVarUpdateOps);
+}
+
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
/// there are no operations in \p outerloop's body other than:
///
@@ -183,7 +227,9 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
while (true) {
loopNest.insert(
{currentLoop,
- InductionVariableInfo{findLoopIterationVarMemDecl(currentLoop)}});
+ InductionVariableInfo{
+ findLoopIterationVarMemDecl(currentLoop),
+ std::move(looputils::extractIndVarUpdateOps(currentLoop))}});
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
@@ -210,6 +256,96 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
return mlir::success();
}
+
+/// Prepares the `fir.do_loop` nest to be easily mapped to OpenMP. In
+/// particular, this function would take this input IR:
+/// ```
+/// fir.do_loop %i_iv = %i_lb to %i_ub step %i_step unordered {
+/// fir.store %i_iv to %i#1 : !fir.ref<i32>
+/// %j_lb = arith.constant 1 : i32
+/// %j_ub = arith.constant 10 : i32
+/// %j_step = arith.constant 1 : index
+///
+/// fir.do_loop %j_iv = %j_lb to %j_ub step %j_step unordered {
+/// fir.store %j_iv to %j#1 : !fir.ref<i32>
+/// ...
+/// }
+/// }
+/// ```
+///
+/// into the following form (using the generic op form since the result is
+/// technically an invalid `fir.do_loop` op):
+///
+/// ```
+/// "fir.do_loop"(%i_lb, %i_ub, %i_step) <{unordered}> ({
+/// ^bb0(%i_iv: index):
+/// %j_lb = "arith.constant"() <{value = 1 : i32}> : () -> i32
+/// %j_ub = "arith.constant"() <{value = 10 : i32}> : () -> i32
+/// %j_step = "arith.constant"() <{value = 1 : index}> : () -> index
+///
+/// "fir.do_loop"(%j_lb, %j_ub, %j_step) <{unordered}> ({
+/// ^bb0(%new_i_iv: index, %new_j_iv: index):
+/// "fir.store"(%new_i_iv, %i#1) : (i32, !fir.ref<i32>) -> ()
+/// "fir.store"(%new_j_iv, %j#1) : (i32, !fir.ref<i32>) -> ()
+/// ...
+/// })
+/// ```
+///
+/// What happened to the loop nest is the following:
+///
+/// * the innermost loop's entry block was updated from having one argument to
+///   having `n` arguments, where `n` is the number of loops in the nest,
+///
+/// * the outer loop(s)' ops that update the IVs were sunk inside the innermost
+/// loop (see the `"fir.store"(%new_i_iv, %i#1)` op above),
+///
+/// * the innermost loop's entry block's arguments were mapped in order from the
+/// outermost to the innermost IV.
+///
+/// With this IR change, we can directly inline the innermost loop's region into
+/// the newly generated `omp.loop_nest` op.
+///
+/// Note that this function has a pre-condition that \p loopNest consists of
+/// perfectly nested loops; i.e., there are no ops between two nested loops
+/// except for the ops that set up the inner loop's LB, UB, and step. These
+/// ops are handled/cloned by `genLoopNestClauseOps(..)`.
+void sinkLoopIVArgs(mlir::ConversionPatternRewriter &rewriter,
+ looputils::LoopNestToIndVarMap &loopNest) {
+ if (loopNest.size() <= 1)
+ return;
+
+ fir::DoLoopOp innermostLoop = loopNest.back().first;
+ mlir::Operation &innermostFirstOp = innermostLoop.getRegion().front().front();
+
+ llvm::SmallVector<mlir::Type> argTypes;
+ llvm::SmallVector<mlir::Location> argLocs;
+
+ for (auto &[doLoop, indVarInfo] : llvm::drop_end(loopNest)) {
+    // Sink the IV update ops to the innermost loop. We need to do this for
+    // all loops except the innermost one, hence the `drop_end` usage above.
+ for (mlir::Operation *op : indVarInfo.indVarUpdateOps)
+ op->moveBefore(&innermostFirstOp);
+
+ argTypes.push_back(doLoop.getInductionVar().getType());
+ argLocs.push_back(doLoop.getInductionVar().getLoc());
+ }
+
+ mlir::Region &innermmostRegion = innermostLoop.getRegion();
+ // Extend the innermost entry block with arguments to represent the outer IVs.
+ innermmostRegion.addArguments(argTypes, argLocs);
+
+ unsigned idx = 1;
+  // Remap the IVs of the loop nest, in reverse (innermost-first) order, from
+  // the old values to the new block arguments. We iterate in reverse because
+  // the block's first argument is the innermost loop's old IV; its uses must
+  // be redirected first, before that same argument is reused as the IV of the
+  // outermost loop in the nest.
+ for (auto &[doLoop, _] : llvm::reverse(loopNest)) {
+ doLoop.getInductionVar().replaceAllUsesWith(
+ innermmostRegion.getArgument(innermmostRegion.getNumArguments() - idx));
+ ++idx;
+ }
+}
} // namespace looputils
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
@@ -236,6 +372,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
"Some `do concurent` loops are not perfectly-nested. "
"These will be serialized.");
+ looputils::sinkLoopIVArgs(rewriter, loopNest);
mlir::IRMapping mapper;
genParallelOp(doLoop.getLoc(), rewriter, loopNest, mapper);
mlir::omp::LoopNestOperands loopNestClauseOps;
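
To make the perfect-nesting pre-condition of `sinkLoopIVArgs` (and the "will be serialized" warning above) concrete at the source level, consider the following hypothetical program. The assignment between the two constructs means the nest is not perfectly nested, so, per the warning, only the outer loop would be mapped and the inner one left serialized.

```fortran
! Hypothetical example of an imperfect nest; names and bounds are made up.
program imperfect_nest
  implicit none
  integer, parameter :: n = 10, m = 20
  integer :: a(n, m)
  integer :: i, j

  do concurrent (i = 1:n)
    ! This assignment sits between the two loops, so the nest is not
    ! perfectly nested and collapsing stops at the outer loop.
    a(i, 1) = i
    do concurrent (j = 2:m)
      a(i, j) = a(i, 1) + j
    end do
  end do
end program imperfect_nest
```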
diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
new file mode 100644
index 0000000000000..232420fb07a75
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90
@@ -0,0 +1,72 @@
+! Tests mapping of a `do concurrent` loop with multiple iteration ranges.
+
+! RUN: split-file %s %t
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %t/multi_range.f90 -o - \
+! RUN: | FileCheck %s
+
+!--- multi_range.f90
+program main
+ integer, parameter :: n = 20
+ integer, parameter :: m = 40
+ integer, parameter :: l = 60
+ integer :: a(n, m, l)
+
+ do concurrent(i=3:n, j=5:m, k=7:l)
+ a(i,j,k) = i * j + k
+ end do
+end
+
+! CHECK: func.func @_QQmain
+
+! CHECK: %[[C3:.*]] = arith.constant 3 : i32
+! CHECK: %[[LB_I:.*]] = fir.convert %[[C3]] : (i32) -> index
+! CHECK: %[[C20:.*]] = arith.constant 20 : i32
+! CHECK: %[[UB_I:.*]] = fir.convert %[[C20]] : (i32) -> index
+! CHECK: %[[STEP_I:.*]] = arith.constant 1 : index
+
+! CHECK: %[[C5:.*]] = arith.constant 5 : i32
+! CHECK: %[[LB_J:.*]] = fir.convert %[[C5]] : (i32) -> index
+! CHECK: %[[C40:.*]] = arith.constant 40 : i32
+! CHECK: %[[UB_J:.*]] = fir.convert %[[C40]] : (i32) -> index
+! CHECK: %[[STEP_J:.*]] = arith.constant 1 : index
+
+! CHECK: %[[C7:.*]] = arith.constant 7 : i32
+! CHECK: %[[LB_K:.*]] = fir.convert %[[C7]] : (i32) -> index
+! CHECK: %[[C60:.*]] = arith.constant 60 : i32
+! CHECK: %[[UB_K:.*]] = fir.convert %[[C60]] : (i32) -> index
+! CHECK: %[[STEP_K:.*]] = arith.constant 1 : index
+
+! CHECK: omp.parallel {
+
+! CHECK-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"}
+! CHECK-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"}
+
+! CHECK-NEXT: %[[ITER_VAR_J:.*]] = fir.alloca i32 {bindc_name = "j"}
+! CHECK-NEXT: %[[BINDING_J:.*]]:2 = hlfir.declare %[[ITER_VAR_J]] {uniq_name = "_QFEj"}
+
+! CHECK-NEXT: %[[ITER_VAR_K:.*]] = fir.alloca i32 {bindc_name = "k"}
+! CHECK-NEXT: %[[BINDING_K:.*]]:2 = hlfir.declare %[[ITER_VAR_K]] {uniq_name = "_QFEk"}
+
+! CHECK: omp.wsloop {
+! CHECK-NEXT: omp.loop_nest
+! CHECK-SAME: (%[[ARG0:[^[:space:]]+]], %[[ARG1:[^[:space:]]+]], %[[ARG2:[^[:space:]]+]])
+! CHECK-SAME: : index = (%[[LB_I]], %[[LB_J]], %[[LB_K]])
+! CHECK-SAME: to (%[[UB_I]], %[[UB_J]], %[[UB_K]]) inclusive
+! CHECK-SAME: step (%[[STEP_I]], %[[STEP_J]], %[[STEP_K]]) {
+
+! CHECK-NEXT: %[[IV_IDX_I:.*]] = fir.convert %[[ARG0]]
+! CHECK-NEXT: fir.store %[[IV_IDX_I]] to %[[BINDING_I]]#1
+
+! CHECK-NEXT: %[[IV_IDX_J:.*]] = fir.convert %[[ARG1]]
+! CHECK-NEXT: fir.store %[[IV_IDX_J]] to %[[BINDING_J]]#1
+
+! CHECK-NEXT: %[[IV_IDX_K:.*]] = fir.convert %[[ARG2]]
+! CHECK-NEXT: fir.store %[[IV_IDX_K]] to %[[BINDING_K]]#1
+
+! CHECK: omp.yield
+! CHECK-NEXT: }
+! CHECK-NEXT: }
+
+! CHECK-NEXT: omp.terminator
+! CHECK-NEXT: }
From f7322fc9fc90bd730f9d18f0313ea1d1257f637c Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Mon, 17 Mar 2025 02:43:30 -0500
Subject: [PATCH 2/2] Simplify iv update op detection
---
.../OpenMP/DoConcurrentConversion.cpp | 152 +++++++-----------
1 file changed, 57 insertions(+), 95 deletions(-)
diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index ae3fe4db6b9f5..d25ac4b3c3e79 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -28,108 +28,75 @@ namespace looputils {
/// Stores info needed about the induction/iteration variable for each `do
/// concurrent` in a loop nest.
struct InductionVariableInfo {
+ InductionVariableInfo(fir::DoLoopOp doLoop) { populateInfo(doLoop); }
+
/// The operation allocating memory for iteration variable.
mlir::Operation *iterVarMemDef;
/// the operation(s) updating the iteration variable with the current
/// iteration number.
- llvm::SetVector<mlir::Operation *> indVarUpdateOps;
-};
-
-using LoopNestToIndVarMap =
- llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
+ llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;
-/// For the \p doLoop parameter, find the operation that declares its iteration
-/// variable or allocates memory for it.
-///
-/// For example, give the following loop:
-/// ```
-/// ...
-/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
-/// ...
-/// fir.do_loop %ind_var = %lb to %ub step %s unordered {
-/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
-/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
-/// ...
-/// }
-/// ```
-///
-/// This function returns the `hlfir.declare` op for `%i`.
-///
-/// Note: The current implementation is dependent on how flang emits loop
-/// bodies; which is sufficient for the current simple test/use cases. If this
-/// proves to be insufficient, this should be made more generic.
-mlir::Operation *findLoopIterationVarMemDecl(fir::DoLoopOp doLoop) {
- mlir::Value result = nullptr;
-
- // Checks if a StoreOp is updating the memref of the loop's iteration
- // variable.
- auto isStoringIV = [&](fir::StoreOp storeOp) {
- // Direct store into the IV memref.
- if (storeOp.getValue() == doLoop.getInductionVar())
- return true;
-
- // Indirect store into the IV memref.
- if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
- storeOp.getValue().getDefiningOp())) {
- if (convertOp.getOperand() == doLoop.getInductionVar())
+private:
+ /// For the \p doLoop parameter, find the operation that declares its
+ /// iteration variable or allocates memory for it.
+ ///
+  /// For example, given the following loop:
+ /// ```
+ /// ...
+ /// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
+ /// ...
+ /// fir.do_loop %ind_var = %lb to %ub step %s unordered {
+ /// %ind_var_conv = fir.convert %ind_var : (index) -> i32
+ /// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
+ /// ...
+ /// }
+ /// ```
+ ///
+ /// This function returns the `hlfir.declare` op for `%i`.
+ ///
+ /// Note: The current implementation is dependent on how flang emits loop
+  /// bodies, which is sufficient for the current simple test/use cases. If this
+ /// proves to be insufficient, this should be made more generic.
+ void populateInfo(fir::DoLoopOp doLoop) {
+ mlir::Value result = nullptr;
+
+ // Checks if a StoreOp is updating the memref of the loop's iteration
+ // variable.
+ auto isStoringIV = [&](fir::StoreOp storeOp) {
+ // Direct store into the IV memref.
+ if (storeOp.getValue() == doLoop.getInductionVar()) {
+ indVarUpdateOps.push_back(storeOp);
return true;
- }
-
- return false;
- };
-
- for (mlir::Operation &op : doLoop) {
- if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
- if (isStoringIV(storeOp)) {
- result = storeOp.getMemref();
- break;
}
- }
- assert(result != nullptr && result.getDefiningOp() != nullptr);
- return result.getDefiningOp();
-}
-
-/// Collects the op(s) responsible for updating a loop's iteration variable with
-/// the current iteration number. For example, for the input IR:
-/// ```
-/// %i = fir.alloca i32 {bindc_name = "i"}
-/// %i_decl:2 = hlfir.declare %i ...
-/// ...
-/// fir.do_loop %i_iv = %lb to %ub step %step unordered {
-/// %1 = fir.convert %i_iv : (index) -> i32
-/// fir.store %1 to %i_decl#1 : !fir.ref<i32>
-/// ...
-/// }
-/// ```
-/// this function would return the first 2 ops in the `fir.do_loop`'s region.
-llvm::SetVector<mlir::Operation *>
-extractIndVarUpdateOps(fir::DoLoopOp doLoop) {
- mlir::Value indVar = doLoop.getInductionVar();
- llvm::SetVector<mlir::Operation *> indVarUpdateOps;
-
- llvm::SmallVector<mlir::Value> toProcess;
- toProcess.push_back(indVar);
-
- llvm::DenseSet<mlir::Value> done;
-
- while (!toProcess.empty()) {
- mlir::Value val = toProcess.back();
- toProcess.pop_back();
-
- if (!done.insert(val).second)
- continue;
+ // Indirect store into the IV memref.
+ if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
+ storeOp.getValue().getDefiningOp())) {
+ if (convertOp.getOperand() == doLoop.getInductionVar()) {
+ indVarUpdateOps.push_back(convertOp);
+ indVarUpdateOps.push_back(storeOp);
+ return true;
+ }
+ }
- for (mlir::Operation *user : val.getUsers()) {
- indVarUpdateOps.insert(user);
+ return false;
+ };
- for (mlir::Value result : user->getResults())
- toProcess.push_back(result);
+ for (mlir::Operation &op : doLoop) {
+ if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
+ if (isStoringIV(storeOp)) {
+ result = storeOp.getMemref();
+ break;
+ }
}
+
+ assert(result != nullptr && result.getDefiningOp() != nullptr);
+ iterVarMemDef = result.getDefiningOp();
}
+};
- return std::move(indVarUpdateOps);
-}
+using LoopNestToIndVarMap =
+ llvm::MapVector<fir::DoLoopOp, InductionVariableInfo>;
/// Loop \p innerLoop is considered perfectly-nested inside \p outerLoop iff
/// there are no operations in \p outerloop's body other than:
@@ -225,12 +192,7 @@ mlir::LogicalResult collectLoopNest(fir::DoLoopOp currentLoop,
assert(currentLoop.getUnordered());
while (true) {
- loopNest.insert(
- {currentLoop,
- InductionVariableInfo{
- findLoopIterationVarMemDecl(currentLoop),
- std::move(looputils::extractIndVarUpdateOps(currentLoop))}});
-
+ loopNest.insert({currentLoop, InductionVariableInfo(currentLoop)});
llvm::SmallVector<fir::DoLoopOp> unorderedLoops;
for (auto nestedLoop : currentLoop.getRegion().getOps<fir::DoLoopOp>())