[Mlir-commits] [mlir] [WIP][flang][OpenMP] Translate OpenMP scopes when compiling for target device (PR #130078)

Kareem Ergawy llvmlistbot at llvm.org
Fri Mar 7 00:23:15 PST 2025


https://github.com/ergawy updated https://github.com/llvm/llvm-project/pull/130078

>From b55bae368ef822c06cdc61d2556ecd0855ba223a Mon Sep 17 00:00:00 2001
From: ergawy <kareem.ergawy at amd.com>
Date: Thu, 6 Mar 2025 03:16:59 -0600
Subject: [PATCH] [flang][OpenMP] Translate OpenMP scopes when compiling for
 target device

If a `target` directive is nested in a host OpenMP directive (e.g.
parallel, task, or a worksharing loop), flang currently crashes if the
target directive-related MLIR ops (e.g. `omp.map.bounds` and
`omp.map.info`) depend on SSA values defined inside the parent host
OpenMP directives/ops.

This PR tries to solve this problem by treating these parent OpenMP ops
as "SSA scopes". Whenever we are translating for the device, instead of
completely translating host ops, we just translate their MLIR ops as pure
SSA values.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  91 ++++++++++--
 .../openmp-target-nesting-in-host-ops.mlir    | 136 ++++++++++++++++++
 2 files changed, 218 insertions(+), 9 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 32c7c501d03c3..d0dc646ae536a 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -542,6 +542,18 @@ static llvm::omp::ProcBindKind getProcBindKind(omp::ClauseProcBindKind kind) {
   llvm_unreachable("Unknown ClauseProcBindKind kind");
 }
 
+/// Maps elements of \p blockArgs (which are MLIR values) to the corresponding
+/// LLVM values of \p operands' elements. This is useful when an OpenMP region
+/// with entry block arguments is converted to LLVM. In this case \p blockArgs
+/// are (part of) the OpenMP region's entry arguments and \p operands are
+/// (part of) the operands to the OpenMP op containing the region.
+static void forwardArgs(LLVM::ModuleTranslation &moduleTranslation,
+                        llvm::ArrayRef<BlockArgument> blockArgs,
+                        OperandRange operands) {
+  for (auto [arg, var] : llvm::zip_equal(blockArgs, operands))
+    moduleTranslation.mapValue(arg, moduleTranslation.lookupValue(var));
+}
+
 /// Helper function to map block arguments defined by ignored loop wrappers to
 /// LLVM values and prevent any uses of those from triggering null pointer
 /// dereferences.
@@ -554,18 +566,12 @@ convertIgnoredWrapper(omp::LoopWrapperInterface opInst,
   // Map block arguments directly to the LLVM value associated to the
   // corresponding operand. This is semantically equivalent to this wrapper not
   // being present.
-  auto forwardArgs =
-      [&moduleTranslation](llvm::ArrayRef<BlockArgument> blockArgs,
-                           OperandRange operands) {
-        for (auto [arg, var] : llvm::zip_equal(blockArgs, operands))
-          moduleTranslation.mapValue(arg, moduleTranslation.lookupValue(var));
-      };
-
   return llvm::TypeSwitch<Operation *, LogicalResult>(opInst)
       .Case([&](omp::SimdOp op) {
         auto blockArgIface = cast<omp::BlockArgOpenMPOpInterface>(*op);
-        forwardArgs(blockArgIface.getPrivateBlockArgs(), op.getPrivateVars());
-        forwardArgs(blockArgIface.getReductionBlockArgs(),
+        forwardArgs(moduleTranslation, blockArgIface.getPrivateBlockArgs(),
+                    op.getPrivateVars());
+        forwardArgs(moduleTranslation, blockArgIface.getReductionBlockArgs(),
                     op.getReductionVars());
         op.emitWarning() << "simd information on composite construct discarded";
         return success();
@@ -4673,6 +4679,8 @@ static LogicalResult
 convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
   auto targetOp = cast<omp::TargetOp>(opInst);
+  llvm::errs() << ">>>> input module: "
+               << targetOp->getParentOfType<mlir::ModuleOp>() << "\n";
   if (failed(checkImplementationStatus(opInst)))
     return failure();
 
@@ -5236,6 +5244,26 @@ convertTargetDeviceOp(Operation *op, llvm::IRBuilderBase &builder,
   return convertHostOrTargetOperation(op, builder, moduleTranslation);
 }
 
+template <typename OMPOp>
+static void forwardPrivateArgs(OMPOp ompOp,
+                               LLVM::ModuleTranslation &moduleTranslation) {
+  auto blockArgIface = cast<omp::BlockArgOpenMPOpInterface>(*ompOp);
+  if (blockArgIface) {
+    forwardArgs(moduleTranslation, blockArgIface.getPrivateBlockArgs(),
+                ompOp.getPrivateVars());
+  }
+}
+
+template <typename OMPOp>
+static void forwardReductionArgs(OMPOp ompOp,
+                                 LLVM::ModuleTranslation &moduleTranslation) {
+  auto blockArgIface = cast<omp::BlockArgOpenMPOpInterface>(*ompOp);
+  if (blockArgIface) {
+    forwardArgs(moduleTranslation, blockArgIface.getReductionBlockArgs(),
+                ompOp.getReductionVars());
+  }
+}
+
 static LogicalResult
 convertTargetOpsInNest(Operation *op, llvm::IRBuilderBase &builder,
                        LLVM::ModuleTranslation &moduleTranslation) {
@@ -5255,6 +5283,51 @@ convertTargetOpsInNest(Operation *op, llvm::IRBuilderBase &builder,
               return WalkResult::interrupt();
             return WalkResult::skip();
           }
+
+          // Non-target ops might nest target-related ops, therefore, we
+          // translate them as non-OpenMP scopes. Translating them is needed by
+          // nested target-related ops since they might use LLVM values defined
+          // in their parent non-target ops.
+          if (isa<omp::OpenMPDialect>(oper->getDialect()) &&
+              oper->getParentOfType<LLVM::LLVMFuncOp>() &&
+              !oper->getRegions().empty()) {
+
+            // TODO Handle other ops with entry block args.
+            llvm::TypeSwitch<Operation &>(*oper)
+                .Case([&](omp::WsloopOp wsloopOp) {
+                  forwardPrivateArgs(wsloopOp, moduleTranslation);
+                  forwardReductionArgs(wsloopOp, moduleTranslation);
+                })
+                .Case([&](omp::ParallelOp parallelOp) {
+                  forwardPrivateArgs(parallelOp, moduleTranslation);
+                  forwardReductionArgs(parallelOp, moduleTranslation);
+                })
+                .Case([&](omp::TaskOp taskOp) {
+                  forwardPrivateArgs(taskOp, moduleTranslation);
+                });
+
+            if (auto loopNest = dyn_cast<omp::LoopNestOp>(oper)) {
+              for (auto iv : loopNest.getIVs()) {
+                // Create fake allocas just to maintain IR validity.
+                moduleTranslation.mapValue(
+                    iv, builder.CreateAlloca(
+                            moduleTranslation.convertType(iv.getType())));
+              }
+            }
+
+            for (Region &region : oper->getRegions()) {
+              auto result = convertOmpOpRegions(
+                  region, oper->getName().getStringRef().str() + ".fake.region",
+                  builder, moduleTranslation);
+              if (failed(handleError(result, *oper)))
+                return WalkResult::interrupt();
+
+              builder.SetInsertPoint(result.get(), result.get()->end());
+            }
+
+            return WalkResult::skip();
+          }
+
           return WalkResult::advance();
         }).wasInterrupted();
   return failure(interrupted);
diff --git a/mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir b/mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir
new file mode 100644
index 0000000000000..2b3bde46a787c
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir
@@ -0,0 +1,136 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+
+  omp.private {type = private} @i32_privatizer : i32
+
+  llvm.func @test_nested_target_in_parallel(%arg0: !llvm.ptr) {
+    omp.parallel {
+    %0 = llvm.mlir.constant(4 : index) : i64
+    %1 = llvm.mlir.constant(1 : index) : i64
+    %4 = omp.map.bounds   lower_bound(%1 : i64) upper_bound(%0 : i64) stride(%1 : i64) start_idx(%1 : i64)
+    %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>)   map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+    omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+      omp.terminator
+    }
+      omp.terminator
+    }
+    llvm.return
+  }
+
+// CHECK-LABEL: define void @test_nested_target_in_parallel({{.*}}) {
+// CHECK-NEXT:    br label %omp.parallel.fake.region
+// CHECK:       omp.parallel.fake.region:
+// CHECK-NEXT:    br label %omp.region.cont
+// CHECK:       omp.region.cont:
+// CHECK-NEXT:    ret void
+// CHECK-NEXT:  }
+
+  llvm.func @test_nested_target_in_wsloop(%arg0: !llvm.ptr) {
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    %9 = llvm.alloca %8 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    %16 = llvm.mlir.constant(10 : i32) : i32
+    %17 = llvm.mlir.constant(1 : i32) : i32
+    omp.wsloop private(@i32_privatizer %9 -> %loop_arg : !llvm.ptr) {
+      omp.loop_nest (%arg1) : i32 = (%17) to (%16) inclusive step (%17) {
+        llvm.store %arg1, %loop_arg : i32, !llvm.ptr
+        %0 = llvm.mlir.constant(4 : index) : i64
+        %1 = llvm.mlir.constant(1 : index) : i64
+        %4 = omp.map.bounds   lower_bound(%1 : i64) upper_bound(%0 : i64) stride(%1 : i64) start_idx(%1 : i64)
+        %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>)   map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+        omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+          omp.terminator
+        }
+        omp.yield
+      }
+    }
+    llvm.return
+  }
+
+// CHECK-LABEL: define void @test_nested_target_in_wsloop(ptr %0) {
+// CHECK-NEXT:    %{{.*}} = alloca i32, i64 1, align 4
+// CHECK-NEXT:    br label %omp.wsloop.fake.region
+// CHECK:       omp.wsloop.fake.region:
+// CHECK-NEXT:    %{{.*}} = alloca i32, align 4
+// CHECK-NEXT:    br label %omp.loop_nest.fake.region
+// CHECK:       omp.loop_nest.fake.region:
+// CHECK-NEXT:    store ptr %3, ptr %2, align 8
+// CHECK-NEXT:    br label %omp.region.cont1
+// CHECK:       omp.region.cont1:
+// CHECK-NEXT:    br label %omp.region.cont
+// CHECK:       omp.region.cont:
+// CHECK-NEXT:    ret void
+// CHECK-NEXT:  }
+
+  llvm.func @test_nested_target_in_parallel_with_private(%arg0: !llvm.ptr) {
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    %9 = llvm.alloca %8 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    omp.parallel private(@i32_privatizer %9 -> %i_priv_arg : !llvm.ptr) {
+        %1 = llvm.mlir.constant(1 : index) : i64
+        // Use the private clause from omp.parallel to make sure block arguments
+        // are handled.
+        %i_val = llvm.load %i_priv_arg : !llvm.ptr -> i64
+        %4 = omp.map.bounds   lower_bound(%1 : i64) upper_bound(%i_val : i64) stride(%1 : i64) start_idx(%1 : i64)
+        %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>)   map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+        omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+          omp.terminator
+        }
+        omp.terminator
+    }
+    llvm.return
+  }
+
+  llvm.func @test_nested_target_in_task_with_private(%arg0: !llvm.ptr) {
+    %8 = llvm.mlir.constant(1 : i64) : i64
+    %9 = llvm.alloca %8 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+    omp.task private(@i32_privatizer %9 -> %i_priv_arg : !llvm.ptr) {
+        %1 = llvm.mlir.constant(1 : index) : i64
+        // Use the private clause from omp.task to make sure block arguments
+        // are handled.
+        %i_val = llvm.load %i_priv_arg : !llvm.ptr -> i64
+        %4 = omp.map.bounds   lower_bound(%1 : i64) upper_bound(%i_val : i64) stride(%1 : i64) start_idx(%1 : i64)
+        %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>)   map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+        omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+          omp.terminator
+        }
+        omp.terminator
+    }
+    llvm.return
+  }
+
+// CHECK-LABEL: define void @test_nested_target_in_parallel_with_private({{.*}}) {
+// CHECK:        br label %omp.parallel.fake.region
+// CHECK:       omp.parallel.fake.region:
+// CHECK:         br label %omp.region.cont
+// CHECK:       omp.region.cont:
+// CHECK-NEXT:    ret void
+// CHECK-NEXT:  }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_nested_target_in_parallel_{{.*}} {
+// CHECK:         call i32 @__kmpc_target_init
+// CHECK:       user_code.entry:
+// CHECK:         call void @__kmpc_target_deinit()
+// CHECK:         ret void
+// CHECK:       }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_test_nested_target_in_wsloop_{{.*}} {
+// CHECK:         call i32 @__kmpc_target_init
+// CHECK:       user_code.entry:
+// CHECK:         call void @__kmpc_target_deinit()
+// CHECK:         ret void
+// CHECK:       }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_test_nested_target_in_parallel_with_private_{{.*}} {
+// CHECK:         call i32 @__kmpc_target_init
+// CHECK:       user_code.entry:
+// CHECK:         call void @__kmpc_target_deinit()
+// CHECK:         ret void
+// CHECK:       }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_test_nested_target_in_task_with_private_{{.*}} {
+// CHECK:         call i32 @__kmpc_target_init
+// CHECK:       user_code.entry:
+// CHECK:         call void @__kmpc_target_deinit()
+// CHECK:         ret void
+// CHECK:       }
+}



More information about the Mlir-commits mailing list