[Mlir-commits] [mlir] e737b84 - [flang][OpenMP] Translate OpenMP scopes when compiling for target device (#130078)
llvmlistbot at llvm.org
Wed Mar 19 00:26:22 PDT 2025
Author: Kareem Ergawy
Date: 2025-03-19T08:26:19+01:00
New Revision: e737b846b4a34940b626c2a1119779caaa430460
URL: https://github.com/llvm/llvm-project/commit/e737b846b4a34940b626c2a1119779caaa430460
DIFF: https://github.com/llvm/llvm-project/commit/e737b846b4a34940b626c2a1119779caaa430460.diff
LOG: [flang][OpenMP] Translate OpenMP scopes when compiling for target device (#130078)
If a `target` directive is nested in a host OpenMP directive (e.g.
parallel, task, or a worksharing loop), flang currently crashes if the
target directive-related MLIR ops (e.g. `omp.map.bounds` and
`omp.map.info`) depend on SSA values defined inside the parent host
OpenMP directives/ops.
This PR addresses the problem by treating these parent OpenMP ops as
"SSA scopes". Whenever we are translating for the device, instead of
completely translating the host ops, we just translate their nested
MLIR ops as pure SSA values.
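For illustration, a minimal sketch of the nesting in question, adapted
from the tests added below (`%ptr` stands in for an `!llvm.ptr`
function argument): the map-related ops feeding `omp.target` use SSA
values defined inside the enclosing host `omp.parallel` region.

  omp.parallel {
    // Host-defined SSA values that the target-related ops depend on.
    %c1 = llvm.mlir.constant(1 : index) : i64
    %bounds = omp.map.bounds lower_bound(%c1 : i64) upper_bound(%c1 : i64) stride(%c1 : i64) start_idx(%c1 : i64)
    %map = omp.map.info var_ptr(%ptr : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%bounds) -> !llvm.ptr
    // When compiling for the target device, looking up %map previously
    // crashed because the parent omp.parallel had not been translated.
    omp.target map_entries(%map -> %arg : !llvm.ptr) {
      omp.terminator
    }
    omp.terminator
  }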
Added:
mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir
Modified:
mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
Removed:
################################################################################
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 537558a83cb36..88d2e56e2f36e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
@@ -536,6 +537,20 @@ static llvm::omp::ProcBindKind getProcBindKind(omp::ClauseProcBindKind kind) {
llvm_unreachable("Unknown ClauseProcBindKind kind");
}
+/// Maps block arguments from \p blockArgIface (which are MLIR values) to the
+/// corresponding LLVM values of the interface's operands. This is useful
+/// when an OpenMP region with entry block arguments is converted to LLVM: in
+/// that case the block arguments are (part of) the OpenMP region's entry
+/// arguments and the operands are (part of) the operands of the OpenMP op
+/// containing the region.
+static void forwardArgs(LLVM::ModuleTranslation &moduleTranslation,
+ omp::BlockArgOpenMPOpInterface blockArgIface) {
+ llvm::SmallVector<std::pair<Value, BlockArgument>> blockArgsPairs;
+ blockArgIface.getBlockArgsPairs(blockArgsPairs);
+ for (auto [var, arg] : blockArgsPairs)
+ moduleTranslation.mapValue(arg, moduleTranslation.lookupValue(var));
+}
+
/// Helper function to map block arguments defined by ignored loop wrappers to
/// LLVM values and prevent any uses of those from triggering null pointer
/// dereferences.
@@ -548,17 +563,10 @@ convertIgnoredWrapper(omp::LoopWrapperInterface opInst,
// Map block arguments directly to the LLVM value associated to the
// corresponding operand. This is semantically equivalent to this wrapper not
// being present.
- auto forwardArgs =
- [&moduleTranslation](omp::BlockArgOpenMPOpInterface blockArgIface) {
- llvm::SmallVector<std::pair<Value, BlockArgument>> blockArgsPairs;
- blockArgIface.getBlockArgsPairs(blockArgsPairs);
- for (auto [var, arg] : blockArgsPairs)
- moduleTranslation.mapValue(arg, moduleTranslation.lookupValue(var));
- };
-
return llvm::TypeSwitch<Operation *, LogicalResult>(opInst)
.Case([&](omp::SimdOp op) {
- forwardArgs(cast<omp::BlockArgOpenMPOpInterface>(*op));
+ forwardArgs(moduleTranslation,
+ cast<omp::BlockArgOpenMPOpInterface>(*op));
op.emitWarning() << "simd information on composite construct discarded";
return success();
})
@@ -5351,6 +5359,46 @@ convertTargetOpsInNest(Operation *op, llvm::IRBuilderBase &builder,
return WalkResult::interrupt();
return WalkResult::skip();
}
+
+ // Non-target ops might nest target-related ops; therefore, we
+ // translate them as non-OpenMP scopes. Translating them is needed by
+ // nested target-related ops since they might need LLVM values defined
+ // in their parent non-target ops.
+ if (isa<omp::OpenMPDialect>(oper->getDialect()) &&
+ oper->getParentOfType<LLVM::LLVMFuncOp>() &&
+ !oper->getRegions().empty()) {
+ if (auto blockArgsIface =
+ dyn_cast<omp::BlockArgOpenMPOpInterface>(oper))
+ forwardArgs(moduleTranslation, blockArgsIface);
+
+ if (auto loopNest = dyn_cast<omp::LoopNestOp>(oper)) {
+ assert(builder.GetInsertBlock() &&
+ "No insert block is set for the builder");
+ for (auto iv : loopNest.getIVs()) {
+ // Map iv to an undefined value just to keep the IR valid.
+ moduleTranslation.mapValue(
+ iv, llvm::PoisonValue::get(
+ moduleTranslation.convertType(iv.getType())));
+ }
+ }
+
+ for (Region &region : oper->getRegions()) {
+ // Regions are fake in the sense that they are not a truthful
+ // translation of the OpenMP construct being converted (e.g. no
+ // OpenMP runtime calls will be generated). We just need this to
+ // prepare the kernel invocation args.
+ auto result = convertOmpOpRegions(
+ region, oper->getName().getStringRef().str() + ".fake.region",
+ builder, moduleTranslation);
+ if (failed(handleError(result, *oper)))
+ return WalkResult::interrupt();
+
+ builder.SetInsertPoint(result.get(), result.get()->end());
+ }
+
+ return WalkResult::skip();
+ }
+
return WalkResult::advance();
}).wasInterrupted();
return failure(interrupted);
diff --git a/mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir b/mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir
new file mode 100644
index 0000000000000..71a4c29eaf0aa
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-nesting-in-host-ops.mlir
@@ -0,0 +1,135 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+
+ omp.private {type = private} @i32_privatizer : i32
+
+ llvm.func @test_nested_target_in_parallel(%arg0: !llvm.ptr) {
+ omp.parallel {
+ %0 = llvm.mlir.constant(4 : index) : i64
+ %1 = llvm.mlir.constant(1 : index) : i64
+ %4 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) stride(%1 : i64) start_idx(%1 : i64)
+ %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+ omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+
+// CHECK-LABEL: define void @test_nested_target_in_parallel({{.*}}) {
+// CHECK-NEXT: br label %omp.parallel.fake.region
+// CHECK: omp.parallel.fake.region:
+// CHECK-NEXT: br label %omp.region.cont
+// CHECK: omp.region.cont:
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+ llvm.func @test_nested_target_in_wsloop(%arg0: !llvm.ptr) {
+ %8 = llvm.mlir.constant(1 : i64) : i64
+ %9 = llvm.alloca %8 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ %16 = llvm.mlir.constant(10 : i32) : i32
+ %17 = llvm.mlir.constant(1 : i32) : i32
+ omp.wsloop private(@i32_privatizer %9 -> %loop_arg : !llvm.ptr) {
+ omp.loop_nest (%arg1) : i32 = (%17) to (%16) inclusive step (%17) {
+ llvm.store %arg1, %loop_arg : i32, !llvm.ptr
+ %0 = llvm.mlir.constant(4 : index) : i64
+ %1 = llvm.mlir.constant(1 : index) : i64
+ %4 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%0 : i64) stride(%1 : i64) start_idx(%1 : i64)
+ %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+ omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+ omp.terminator
+ }
+ omp.yield
+ }
+ }
+ llvm.return
+ }
+
+// CHECK-LABEL: define void @test_nested_target_in_wsloop(ptr %0) {
+// CHECK-NEXT: %{{.*}} = alloca i32, i64 1, align 4
+// CHECK-NEXT: br label %omp.wsloop.fake.region
+// CHECK: omp.wsloop.fake.region:
+// CHECK-NEXT: br label %omp.loop_nest.fake.region
+// CHECK: omp.loop_nest.fake.region:
+// CHECK-NEXT: store i32 poison, ptr %{{.*}}
+// CHECK-NEXT: br label %omp.region.cont1
+// CHECK: omp.region.cont1:
+// CHECK-NEXT: br label %omp.region.cont
+// CHECK: omp.region.cont:
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+ llvm.func @test_nested_target_in_parallel_with_private(%arg0: !llvm.ptr) {
+ %8 = llvm.mlir.constant(1 : i64) : i64
+ %9 = llvm.alloca %8 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ omp.parallel private(@i32_privatizer %9 -> %i_priv_arg : !llvm.ptr) {
+ %1 = llvm.mlir.constant(1 : index) : i64
+ // Use the private clause from omp.parallel to make sure block arguments
+ // are handled.
+ %i_val = llvm.load %i_priv_arg : !llvm.ptr -> i64
+ %4 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%i_val : i64) stride(%1 : i64) start_idx(%1 : i64)
+ %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+ omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+
+ llvm.func @test_nested_target_in_task_with_private(%arg0: !llvm.ptr) {
+ %8 = llvm.mlir.constant(1 : i64) : i64
+ %9 = llvm.alloca %8 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr
+ omp.task private(@i32_privatizer %9 -> %i_priv_arg : !llvm.ptr) {
+ %1 = llvm.mlir.constant(1 : index) : i64
+ // Use the private clause from omp.task to make sure block arguments
+ // are handled.
+ %i_val = llvm.load %i_priv_arg : !llvm.ptr -> i64
+ %4 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%i_val : i64) stride(%1 : i64) start_idx(%1 : i64)
+ %mapv1 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr {name = ""}
+ omp.target map_entries(%mapv1 -> %map_arg : !llvm.ptr) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+
+// CHECK-LABEL: define void @test_nested_target_in_parallel_with_private({{.*}}) {
+// CHECK: br label %omp.parallel.fake.region
+// CHECK: omp.parallel.fake.region:
+// CHECK: br label %omp.region.cont
+// CHECK: omp.region.cont:
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_nested_target_in_parallel_{{.*}} {
+// CHECK: call i32 @__kmpc_target_init
+// CHECK: user_code.entry:
+// CHECK: call void @__kmpc_target_deinit()
+// CHECK: ret void
+// CHECK: }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_test_nested_target_in_wsloop_{{.*}} {
+// CHECK: call i32 @__kmpc_target_init
+// CHECK: user_code.entry:
+// CHECK: call void @__kmpc_target_deinit()
+// CHECK: ret void
+// CHECK: }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_test_nested_target_in_parallel_with_private_{{.*}} {
+// CHECK: call i32 @__kmpc_target_init
+// CHECK: user_code.entry:
+// CHECK: call void @__kmpc_target_deinit()
+// CHECK: ret void
+// CHECK: }
+
+// CHECK-LABEL: define {{.*}} amdgpu_kernel void @__omp_offloading_{{.*}}_test_nested_target_in_task_with_private_{{.*}} {
+// CHECK: call i32 @__kmpc_target_init
+// CHECK: user_code.entry:
+// CHECK: call void @__kmpc_target_deinit()
+// CHECK: ret void
+// CHECK: }
+}