[flang-commits] [llvm] [mlir] [flang] [mlir][OpenMP] Add outlining pass for TargetOp (PR #78328)

Tue Jan 16 11:04:22 PST 2024

https://github.com/fabianmcg created https://github.com/llvm/llvm-project/pull/78328

This patch adds a pass to outline OpenMP target operations into a GPU module,
allowing them to be compiled using the GPU dialect compilation infrastructure.

The pass works by traversing each function, outlining the ops to a GPU module,
and then cloning all the symbols referenced inside the target regions marked
with a declare target attribute. The outlining mechanism is similar to the one
found in `gpu-kernel-outlining`.

Note: Ignore the base commits, they are being reviewed in other PRs.

>From 61c8809698b66cf3b4686e9908fb11773ecf0eb6 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Sat, 13 Jan 2024 23:45:57 +0000
Subject: [PATCH 1/9] [mlir][interfaces] Add the `TargetInfo` attribute
 interface

This patch adds the TargetInfo attribute interface to the set of DLTI
interfaces. Target information attributes provide essential information on the
compilation target. This information includes the target triple identifier, the
target chip identifier, and a string representation of the target features.

This patch also adds this new interface to the NVVM and ROCDL GPU target
attributes.
---
 .../include/mlir/Dialect/LLVMIR/NVVMDialect.h |  1 +
 mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td   |  5 ++-
 .../mlir/Dialect/LLVMIR/ROCDLDialect.h        |  1 +
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td  |  6 ++--
 .../mlir/Interfaces/DataLayoutInterfaces.td   | 33 +++++++++++++++++++
 mlir/lib/Dialect/LLVMIR/CMakeLists.txt        |  2 ++
 mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp    |  8 +++++
 mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp   |  8 +++++
 8 files changed, 61 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
index 08019e77ae6af8..1a55d08be9edc2 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
index c5f68a2ebe3952..0bbbde6270cd69 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -17,6 +17,7 @@ include "mlir/IR/EnumAttr.td"
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/DataLayoutInterfaces.td"
 include "mlir/Dialect/LLVMIR/BasicPtxBuilderInterface.td"
 
 def LLVM_PointerGlobal : LLVM_PointerInAddressSpace<1>;
@@ -1894,7 +1895,9 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async",
 // NVVM target attribute.
 //===----------------------------------------------------------------------===//
 
-def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target"> {
+def NVVM_TargettAttr : NVVM_Attr<"NVVMTarget", "target", [
+    DeclareAttrInterfaceMethods<TargetInfoAttrInterface>
+  ]> {
   let description = [{
     GPU target attribute for controlling compilation of NVIDIA targets. All
     parameters decay into default values if not present.
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
index c2a82ffc1c43cf..fa1131a463e1ab 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
@@ -26,6 +26,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
 ///// Ops /////
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 48b830ae34f292..a492709c299544 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -15,6 +15,7 @@
 
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
+include "mlir/Interfaces/DataLayoutInterfaces.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 //===----------------------------------------------------------------------===//
@@ -608,8 +609,9 @@ def ROCDL_CvtSrFp8F32Op :
 // ROCDL target attribute.
 //===----------------------------------------------------------------------===//
 
-def ROCDL_TargettAttr :
-    ROCDL_Attr<"ROCDLTarget", "target"> {
+def ROCDL_TargettAttr : ROCDL_Attr<"ROCDLTarget", "target", [
+    DeclareAttrInterfaceMethods<TargetInfoAttrInterface>
+  ]> {
   let description = [{
     ROCDL target attribute for controlling compilation of AMDGPU targets. All
     parameters decay into default values if not present.
diff --git a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
index a8def967fffcfa..eac9521aadc11e 100644
--- a/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
+++ b/mlir/include/mlir/Interfaces/DataLayoutInterfaces.td
@@ -188,6 +188,39 @@ def DataLayoutSpecInterface : AttrInterface<"DataLayoutSpecInterface"> {
   }];
 }
 
+def TargetInfoAttrInterface : AttrInterface<"TargetInfoAttrInterface"> {
+  let cppNamespace = "::mlir";
+
+  let description = [{
+    Attribute interface describing target information.
+
+    Target information attributes provide essential information on the
+    compilation target. This information includes the target triple identifier,
+    the target chip identifier, and a string representation of the target features.
+  }];
+
+  let methods = [
+    InterfaceMethod<
+      /*description=*/"Returns the target triple identifier.",
+      /*retTy=*/"::mlir::StringRef",
+      /*methodName=*/"getTargetTriple",
+      /*args=*/(ins)
+    >,
+    InterfaceMethod<
+      /*description=*/"Returns the target chip identifier.",
+      /*retTy=*/"::mlir::StringRef",
+      /*methodName=*/"getTargetChip",
+      /*args=*/(ins)
+    >,
+    InterfaceMethod<
+      /*description=*/"Returns the target features as a string.",
+      /*retTy=*/"std::string",
+      /*methodName=*/"getTargetFeatures",
+      /*args=*/(ins)
+    >
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // Operation interface
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
index b00259677697a5..00b78e30ee8b09 100644
--- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt
@@ -61,6 +61,7 @@ add_mlir_dialect_library(MLIRNVVMDialect
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRLLVMDialect
+  MLIRDataLayoutInterfaces
   MLIRSideEffectInterfaces
   )
 
@@ -83,5 +84,6 @@ add_mlir_dialect_library(MLIRROCDLDialect
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRLLVMDialect
+  MLIRDataLayoutInterfaces
   MLIRSideEffectInterfaces
   )
diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
index aa49c4dc31fbc0..b73504ac4969af 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp
@@ -1106,6 +1106,14 @@ NVVMTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+StringRef NVVMTargetAttr::getTargetTriple() const { return getTriple(); }
+
+StringRef NVVMTargetAttr::getTargetChip() const { return getChip(); }
+
+std::string NVVMTargetAttr::getTargetFeatures() const {
+  return getFeatures().str();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/LLVMIR/NVVMOps.cpp.inc"
 
diff --git a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
index 26e46b31ddc018..8b10c48718a3f8 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/ROCDLDialect.cpp
@@ -295,6 +295,14 @@ ROCDLTargetAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+StringRef ROCDLTargetAttr::getTargetTriple() const { return getTriple(); }
+
+StringRef ROCDLTargetAttr::getTargetChip() const { return getChip(); }
+
+std::string ROCDLTargetAttr::getTargetFeatures() const {
+  return getFeatures().str();
+}
+
 #define GET_OP_CLASSES
 #include "mlir/Dialect/LLVMIR/ROCDLOps.cpp.inc"
 

>From 436ec9b04bb238238d4a935a8f965a13e70c6846 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Sun, 14 Jan 2024 01:29:19 +0000
Subject: [PATCH 2/9] [mlir][Target][LLVM] Add offload utility class

This patch adds the `OffloadHandler` utility class for creating LLVM offload
entries.
LLVM offload entries hold information on offload symbols; for example, for a
GPU kernel, this includes its host address to identify the kernel and the kernel
identifier in the binary. Arrays of offload entries can be used to register
functions within the CUDA/HIP runtime. Libomptarget also uses these entries to
register OMP target offload kernels and variables.

This patch is 1/4 on introducing the `OffloadEmbeddingAttr` GPU translation
attribute.
---
 mlir/include/mlir/Target/LLVM/Offload.h   |  61 ++++++++++++
 mlir/lib/Target/LLVM/CMakeLists.txt       |   2 +
 mlir/lib/Target/LLVM/Offload.cpp          | 111 ++++++++++++++++++++++
 mlir/unittests/Target/LLVM/CMakeLists.txt |   1 +
 mlir/unittests/Target/LLVM/Offload.cpp    |  49 ++++++++++
 5 files changed, 224 insertions(+)
 create mode 100644 mlir/include/mlir/Target/LLVM/Offload.h
 create mode 100644 mlir/lib/Target/LLVM/Offload.cpp
 create mode 100644 mlir/unittests/Target/LLVM/Offload.cpp

diff --git a/mlir/include/mlir/Target/LLVM/Offload.h b/mlir/include/mlir/Target/LLVM/Offload.h
new file mode 100644
index 00000000000000..7b705667d477d2
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/Offload.h
@@ -0,0 +1,61 @@
+//===- Offload.h - LLVM Target Offload --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares LLVM target offload utility classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_OFFLOAD_H
+#define MLIR_TARGET_LLVM_OFFLOAD_H
+
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class Constant;
+class GlobalVariable;
+class Module;
+} // namespace llvm
+
+namespace mlir {
+namespace LLVM {
+/// `OffloadHandler` is a utility class for creating LLVM offload entries. LLVM
+/// offload entries hold information on offload symbols; for example, for a GPU
+/// kernel, this includes its host address to identify the kernel and the kernel
+/// identifier in the binary. Arrays of offload entries can be used to register
+/// functions within the CUDA/HIP runtime. Libomptarget also uses these entries
+/// to register OMP target offload kernels and variables.
+class OffloadHandler {
+public:
+  using OffloadEntryArray =
+      std::pair<llvm::GlobalVariable *, llvm::GlobalVariable *>;
+  OffloadHandler(llvm::Module &module) : module(module) {}
+
+  /// Returns the begin symbol name used in the entry array.
+  static std::string getBeginSymbol(StringRef suffix);
+
+  /// Returns the end symbol name used in the entry array.
+  static std::string getEndSymbol(StringRef suffix);
+
+  /// Returns the entry array if it exists or a pair of null pointers.
+  OffloadEntryArray getEntryArray(StringRef suffix);
+
+  /// Emits an empty array of offloading entries.
+  OffloadEntryArray emitEmptyEntryArray(StringRef suffix);
+
+  /// Inserts an offloading entry into an existing entry array. This method
+  /// returns failure if the entry array hasn't been declared.
+  LogicalResult insertOffloadEntry(StringRef suffix, llvm::Constant *entry);
+
+protected:
+  llvm::Module &module;
+};
+} // namespace LLVM
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_OFFLOAD_H
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index cc2c3a00a02eaf..241a6c64dd868f 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRTargetLLVM
   ModuleToObject.cpp
+  Offload.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVM
@@ -16,6 +17,7 @@ add_mlir_library(MLIRTargetLLVM
   Passes
   Support
   Target
+  FrontendOffloading
   LINK_LIBS PUBLIC
   MLIRExecutionEngineUtils
   MLIRTargetLLVMIRExport
diff --git a/mlir/lib/Target/LLVM/Offload.cpp b/mlir/lib/Target/LLVM/Offload.cpp
new file mode 100644
index 00000000000000..81ba12403bfb99
--- /dev/null
+++ b/mlir/lib/Target/LLVM/Offload.cpp
@@ -0,0 +1,111 @@
+//===- Offload.cpp - LLVM Target Offload ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines LLVM target offload utility classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+std::string OffloadHandler::getBeginSymbol(StringRef suffix) {
+  return ("__begin_offload_" + suffix).str();
+}
+
+std::string OffloadHandler::getEndSymbol(StringRef suffix) {
+  return ("__end_offload_" + suffix).str();
+}
+
+namespace {
+/// Returns the type of the entry array.
+llvm::ArrayType *getEntryArrayType(llvm::Module &module, size_t numElems) {
+  return llvm::ArrayType::get(llvm::offloading::getEntryTy(module), numElems);
+}
+
+/// Creates the initializer of the entry array.
+llvm::Constant *getEntryArrayBegin(llvm::Module &module,
+                                   ArrayRef<llvm::Constant *> entries) {
+  // If there are no entries return a constant zero initializer.
+  llvm::ArrayType *arrayTy = getEntryArrayType(module, entries.size());
+  return entries.empty() ? llvm::ConstantAggregateZero::get(arrayTy)
+                         : llvm::ConstantArray::get(arrayTy, entries);
+}
+
+/// Computes the end position of the entry array.
+llvm::Constant *getEntryArrayEnd(llvm::Module &module,
+                                 llvm::GlobalVariable *begin, size_t numElems) {
+  llvm::Type *intTy = module.getDataLayout().getIntPtrType(module.getContext());
+  return llvm::ConstantExpr::getGetElementPtr(
+      llvm::offloading::getEntryTy(module), begin,
+      ArrayRef<llvm::Constant *>({llvm::ConstantInt::get(intTy, numElems)}),
+      true);
+}
+} // namespace
+
+OffloadHandler::OffloadEntryArray
+OffloadHandler::getEntryArray(StringRef suffix) {
+  llvm::GlobalVariable *beginGV =
+      module.getGlobalVariable(getBeginSymbol(suffix), true);
+  llvm::GlobalVariable *endGV =
+      module.getGlobalVariable(getEndSymbol(suffix), true);
+  return {beginGV, endGV};
+}
+
+OffloadHandler::OffloadEntryArray
+OffloadHandler::emitEmptyEntryArray(StringRef suffix) {
+  llvm::ArrayType *arrayTy = getEntryArrayType(module, 0);
+  auto *beginGV = new llvm::GlobalVariable(
+      module, arrayTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayBegin(module, {}), getBeginSymbol(suffix));
+  auto *endGV = new llvm::GlobalVariable(
+      module, llvm::PointerType::get(module.getContext(), 0),
+      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayEnd(module, beginGV, 0), getEndSymbol(suffix));
+  return {beginGV, endGV};
+}
+
+LogicalResult OffloadHandler::insertOffloadEntry(StringRef suffix,
+                                                 llvm::Constant *entry) {
+  // Get the begin and end symbols to the entry array.
+  std::string beginSymId = getBeginSymbol(suffix);
+  llvm::GlobalVariable *beginGV = module.getGlobalVariable(beginSymId, true);
+  llvm::GlobalVariable *endGV =
+      module.getGlobalVariable(getEndSymbol(suffix), true);
+  // Fail if the symbols are missing.
+  if (!beginGV || !endGV)
+    return failure();
+  // Create the entry initializer.
+  assert(beginGV->getInitializer() && "entry array initializer is missing.");
+  // Add existing entries into the new entry array.
+  SmallVector<llvm::Constant *> entries;
+  if (auto beginInit = dyn_cast_or_null<llvm::ConstantAggregate>(
+          beginGV->getInitializer())) {
+    for (unsigned i = 0; i < beginInit->getNumOperands(); ++i)
+      entries.push_back(beginInit->getOperand(i));
+  }
+  // Add the new entry.
+  entries.push_back(entry);
+  // Create a global holding the new updated set of entries.
+  auto *arrayTy = llvm::ArrayType::get(llvm::offloading::getEntryTy(module),
+                                       entries.size());
+  auto *entryArr = new llvm::GlobalVariable(
+      module, arrayTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayBegin(module, entries), beginSymId, endGV);
+  // Replace the old entry array variable withe new one.
+  beginGV->replaceAllUsesWith(entryArr);
+  beginGV->eraseFromParent();
+  entryArr->setName(beginSymId);
+  // Update the end symbol.
+  endGV->setInitializer(getEntryArrayEnd(module, entryArr, entries.size()));
+  return success();
+}
diff --git a/mlir/unittests/Target/LLVM/CMakeLists.txt b/mlir/unittests/Target/LLVM/CMakeLists.txt
index 6d612548a94c0f..d04f38ddddfacf 100644
--- a/mlir/unittests/Target/LLVM/CMakeLists.txt
+++ b/mlir/unittests/Target/LLVM/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_unittest(MLIRTargetLLVMTests
+  Offload.cpp
   SerializeNVVMTarget.cpp
   SerializeROCDLTarget.cpp
   SerializeToLLVMBitcode.cpp
diff --git a/mlir/unittests/Target/LLVM/Offload.cpp b/mlir/unittests/Target/LLVM/Offload.cpp
new file mode 100644
index 00000000000000..375edc2e9614d3
--- /dev/null
+++ b/mlir/unittests/Target/LLVM/Offload.cpp
@@ -0,0 +1,49 @@
+//===- Offload.cpp ----------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+
+#include "gmock/gmock.h"
+
+using namespace llvm;
+
+TEST(MLIRTarget, OffloadAPI) {
+  using OffloadEntryArray = mlir::LLVM::OffloadHandler::OffloadEntryArray;
+  LLVMContext llvmContext;
+  Module llvmModule("offload", llvmContext);
+  mlir::LLVM::OffloadHandler handler(llvmModule);
+  StringRef suffix = ".mlir";
+  // Check there's no entry array with `.mlir` suffix.
+  OffloadEntryArray entryArray = handler.getEntryArray(suffix);
+  EXPECT_EQ(entryArray, OffloadEntryArray());
+  // Emit the entry array.
+  handler.emitEmptyEntryArray(suffix);
+  // Check there's an entry array with `.mlir` suffix.
+  entryArray = handler.getEntryArray(suffix);
+  ASSERT_NE(entryArray.first, nullptr);
+  ASSERT_NE(entryArray.second, nullptr);
+  // Check the array contains no entries.
+  auto *zeroInitializer = dyn_cast_or_null<ConstantAggregateZero>(
+      entryArray.first->getInitializer());
+  ASSERT_NE(zeroInitializer, nullptr);
+  // Insert an empty entries.
+  auto emptyEntry =
+      ConstantAggregateZero::get(offloading::getEntryTy(llvmModule));
+  ASSERT_TRUE(succeeded(handler.insertOffloadEntry(suffix, emptyEntry)));
+  // Check there's an entry in the entry array with `.mlir` suffix.
+  entryArray = handler.getEntryArray(suffix);
+  ASSERT_NE(entryArray.first, nullptr);
+  Constant *arrayInitializer = entryArray.first->getInitializer();
+  ASSERT_NE(arrayInitializer, nullptr);
+  auto *arrayTy = dyn_cast_or_null<ArrayType>(arrayInitializer->getType());
+  ASSERT_NE(arrayTy, nullptr);
+  EXPECT_EQ(arrayTy->getNumElements(), 1u);
+}

>From 96ca7efc81c85ee6011add476a5987f6e0efacef Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Mon, 15 Jan 2024 15:01:42 +0000
Subject: [PATCH 3/9] [llvm][OpenMPIRBuilder] Allow to not register offload
 entries in the entry manager This patch adds an optional field in the create
 target method to store the offload entry in a custom location and not
 register the entry in the entry manager. This change is required to enable
 JIT compilation in MLIR for OpenMP target offload ops, as arrays of entries
 are handled differently for standalone MLIR compilation.

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       | 53 ++++++++++++-------
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 36 ++++++++-----
 2 files changed, 57 insertions(+), 32 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 669104307fa0e2..a584552d15954d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -298,9 +298,10 @@ class OffloadEntriesInfoManager {
   void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
                                        unsigned Order);
   /// Register target region entry.
-  void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
-                                     Constant *Addr, Constant *ID,
-                                     OMPTargetRegionEntryKind Flags);
+  void registerTargetRegionEntryInfo(
+      TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
+      OMPTargetRegionEntryKind Flags,
+      OffloadEntryInfoTargetRegion *EntryInfoStorage = nullptr);
   /// Return true if a target region entry with the provided information
   /// exists.
   bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
@@ -2139,10 +2140,14 @@ class OpenMPIRBuilder {
   /// \param GenerateFunctionCallback The callback function to generate the code
   /// \param OutlinedFunction Pointer to the outlined function
   /// \param EntryFnIDName Name of the ID o be created
-  void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
-                                FunctionGenCallback &GenerateFunctionCallback,
-                                bool IsOffloadEntry, Function *&OutlinedFn,
-                                Constant *&OutlinedFnID);
+  /// \param EntryInfoStorage Optional storage location for the offload target
+  /// region entry.
+  void emitTargetRegionFunction(
+      TargetRegionEntryInfo &EntryInfo,
+      FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
+      Function *&OutlinedFn, Constant *&OutlinedFnID,
+      OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion
+          *EntryInfoStorage = nullptr);
 
   /// Registers the given function and sets up the attribtues of the function
   /// Returns the FunctionID.
@@ -2152,10 +2157,13 @@ class OpenMPIRBuilder {
   /// \param OutlinedFunction Pointer to the outlined function
   /// \param EntryFnName Name of the outlined function
   /// \param EntryFnIDName Name of the ID o be created
-  Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
-                                         Function *OutlinedFunction,
-                                         StringRef EntryFnName,
-                                         StringRef EntryFnIDName);
+  /// \param EntryInfoStorage Optional storage location for the offload target
+  /// region entry.
+  Constant *registerTargetRegionFunction(
+      TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction,
+      StringRef EntryFnName, StringRef EntryFnIDName,
+      OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion
+          *EntryInfoStorage = nullptr);
 
   /// Type of BodyGen to use for region codegen
   ///
@@ -2225,15 +2233,20 @@ class OpenMPIRBuilder {
   /// \param BodyGenCB Callback that will generate the region code.
   /// \param ArgAccessorFuncCB Callback that will generate accessors
   /// instructions for passed in target arguments where neccessary
-  InsertPointTy createTarget(const LocationDescription &Loc,
-                             OpenMPIRBuilder::InsertPointTy AllocaIP,
-                             OpenMPIRBuilder::InsertPointTy CodeGenIP,
-                             TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
-                             int32_t NumThreads,
-                             SmallVectorImpl<Value *> &Inputs,
-                             GenMapInfoCallbackTy GenMapInfoCB,
-                             TargetBodyGenCallbackTy BodyGenCB,
-                             TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB);
+  /// \param EntryInfoStorage Optional storage location for the offload target
+  /// region entry. If the pointer is null the entry gets registered in the
+  /// `OffloadEntriesInfoManager`, otherwise it gets stored in
+  /// `EntryInfoStorage` and doesn't get registered in
+  /// `OffloadEntriesInfoManager`.
+  InsertPointTy createTarget(
+      const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP,
+      OpenMPIRBuilder::InsertPointTy CodeGenIP,
+      TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads,
+      SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
+      TargetBodyGenCallbackTy BodyGenCB,
+      TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+      OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion
+          *EntryInfoStorage = nullptr);
 
   /// Returns __kmpc_for_static_init_* runtime function for the specified
   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f6cf358119fb71..c5498d6c67b455 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4741,7 +4741,8 @@ Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
 void OpenMPIRBuilder::emitTargetRegionFunction(
     TargetRegionEntryInfo &EntryInfo,
     FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
-    Function *&OutlinedFn, Constant *&OutlinedFnID) {
+    Function *&OutlinedFn, Constant *&OutlinedFnID,
+    OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion *EntryInfoStorage) {
 
   SmallString<64> EntryFnName;
   OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
@@ -4761,20 +4762,22 @@ void OpenMPIRBuilder::emitTargetRegionFunction(
           ? std::string(EntryFnName)
           : createPlatformSpecificName({EntryFnName, "region_id"});
 
-  OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
-                                              EntryFnName, EntryFnIDName);
+  OutlinedFnID = registerTargetRegionFunction(
+      EntryInfo, OutlinedFn, EntryFnName, EntryFnIDName, EntryInfoStorage);
 }
 
 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
     TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
-    StringRef EntryFnName, StringRef EntryFnIDName) {
+    StringRef EntryFnName, StringRef EntryFnIDName,
+    OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion *EntryInfoStorage) {
   if (OutlinedFn)
     setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
   auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
   auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
   OffloadInfoManager.registerTargetRegionEntryInfo(
       EntryInfo, EntryAddr, OutlinedFnID,
-      OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
+      OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion,
+      EntryInfoStorage);
   return OutlinedFnID;
 }
 
@@ -5094,7 +5097,8 @@ static void emitTargetOutlinedFunction(
     TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
     Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
-    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
+    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB,
+    OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion *EntryInfoStorage) {
 
   OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
       [&OMPBuilder, &Builder, &Inputs, &CBFunc,
@@ -5104,7 +5108,8 @@ static void emitTargetOutlinedFunction(
       };
 
   OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
-                                      OutlinedFn, OutlinedFnID);
+                                      OutlinedFn, OutlinedFnID,
+                                      EntryInfoStorage);
 }
 
 static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
@@ -5165,7 +5170,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
     int32_t NumThreads, SmallVectorImpl<Value *> &Args,
     GenMapInfoCallbackTy GenMapInfoCB,
     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
-    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
+    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
+    OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion *EntryInfoStorage) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
@@ -5174,7 +5180,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
   Function *OutlinedFn;
   Constant *OutlinedFnID;
   emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
-                             OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
+                             OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB,
+                             EntryInfoStorage);
   if (!Config.isTargetDevice())
     emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
                    NumThreads, Args, GenMapInfoCB);
@@ -6929,7 +6936,8 @@ void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
 
 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
     TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
-    OMPTargetRegionEntryKind Flags) {
+    OMPTargetRegionEntryKind Flags,
+    OffloadEntryInfoTargetRegion *EntryInfoStorage) {
   assert(EntryInfo.Count == 0 && "expected default EntryInfo");
 
   // Update the EntryInfo with the next available count for this location.
@@ -6953,8 +6961,12 @@ void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
     assert(!hasTargetRegionEntryInfo(EntryInfo) &&
            "Target region entry already registered!");
     OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
-    OffloadEntriesTargetRegion[EntryInfo] = Entry;
-    ++OffloadingEntriesNum;
+    if (EntryInfoStorage) {
+      *EntryInfoStorage = Entry;
+    } else {
+      OffloadEntriesTargetRegion[EntryInfo] = Entry;
+      ++OffloadingEntriesNum;
+    }
   }
   incrementTargetRegionEntryInfoCount(EntryInfo);
 }

>From ac2df60c670a8118a3b4090f0c592ce4838a79a5 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Sun, 14 Jan 2024 01:29:19 +0000
Subject: [PATCH 4/9] [mlir][Target][LLVM] Add offload utility class

This patch adds the `OffloadHandler` utility class for creating LLVM offload
entries.
LLVM offload entries hold information on offload symbols; for example, for a
GPU kernel, this includes its host address to identify the kernel and the kernel
identifier in the binary. Arrays of offload entries can be used to register
functions within the CUDA/HIP runtime. Libomptarget also uses these entries to
register OMP target offload kernels and variables.

This patch is 1/4 on introducing the `OffloadEmbeddingAttr` GPU translation
attribute.
---
 mlir/include/mlir/Target/LLVM/Offload.h   |  61 ++++++++++++
 mlir/lib/Target/LLVM/CMakeLists.txt       |   2 +
 mlir/lib/Target/LLVM/Offload.cpp          | 111 ++++++++++++++++++++++
 mlir/unittests/Target/LLVM/CMakeLists.txt |   1 +
 mlir/unittests/Target/LLVM/Offload.cpp    |  49 ++++++++++
 5 files changed, 224 insertions(+)
 create mode 100644 mlir/include/mlir/Target/LLVM/Offload.h
 create mode 100644 mlir/lib/Target/LLVM/Offload.cpp
 create mode 100644 mlir/unittests/Target/LLVM/Offload.cpp

diff --git a/mlir/include/mlir/Target/LLVM/Offload.h b/mlir/include/mlir/Target/LLVM/Offload.h
new file mode 100644
index 00000000000000..7b705667d477d2
--- /dev/null
+++ b/mlir/include/mlir/Target/LLVM/Offload.h
@@ -0,0 +1,61 @@
+//===- Offload.h - LLVM Target Offload --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares LLVM target offload utility classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_TARGET_LLVM_OFFLOAD_H
+#define MLIR_TARGET_LLVM_OFFLOAD_H
+
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+class Constant;
+class GlobalVariable;
+class Module;
+} // namespace llvm
+
+namespace mlir {
+namespace LLVM {
+/// `OffloadHandler` is a utility class for creating LLVM offload entries. LLVM
+/// offload entries hold information on offload symbols; for example, for a GPU
+/// kernel, this includes its host address to identify the kernel and the kernel
+/// identifier in the binary. Arrays of offload entries can be used to register
+/// functions within the CUDA/HIP runtime. Libomptarget also uses these entries
+/// to register OMP target offload kernels and variables.
+class OffloadHandler {
+public:
+  using OffloadEntryArray =
+      std::pair<llvm::GlobalVariable *, llvm::GlobalVariable *>;
+  OffloadHandler(llvm::Module &module) : module(module) {}
+
+  /// Returns the begin symbol name used in the entry array.
+  static std::string getBeginSymbol(StringRef suffix);
+
+  /// Returns the end symbol name used in the entry array.
+  static std::string getEndSymbol(StringRef suffix);
+
+  /// Returns the entry array if it exists or a pair of null pointers.
+  OffloadEntryArray getEntryArray(StringRef suffix);
+
+  /// Emits an empty array of offloading entries.
+  OffloadEntryArray emitEmptyEntryArray(StringRef suffix);
+
+  /// Inserts an offloading entry into an existing entry array. This method
+  /// returns failure if the entry array hasn't been declared.
+  LogicalResult insertOffloadEntry(StringRef suffix, llvm::Constant *entry);
+
+protected:
+  llvm::Module &module;
+};
+} // namespace LLVM
+} // namespace mlir
+
+#endif // MLIR_TARGET_LLVM_OFFLOAD_H
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index cc2c3a00a02eaf..241a6c64dd868f 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(MLIRTargetLLVM
   ModuleToObject.cpp
+  Offload.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVM
@@ -16,6 +17,7 @@ add_mlir_library(MLIRTargetLLVM
   Passes
   Support
   Target
+  FrontendOffloading
   LINK_LIBS PUBLIC
   MLIRExecutionEngineUtils
   MLIRTargetLLVMIRExport
diff --git a/mlir/lib/Target/LLVM/Offload.cpp b/mlir/lib/Target/LLVM/Offload.cpp
new file mode 100644
index 00000000000000..81ba12403bfb99
--- /dev/null
+++ b/mlir/lib/Target/LLVM/Offload.cpp
@@ -0,0 +1,111 @@
+//===- Offload.cpp - LLVM Target Offload ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines LLVM target offload utility classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+
+using namespace mlir;
+using namespace mlir::LLVM;
+
+std::string OffloadHandler::getBeginSymbol(StringRef suffix) {
+  return ("__begin_offload_" + suffix).str();
+}
+
+std::string OffloadHandler::getEndSymbol(StringRef suffix) {
+  return ("__end_offload_" + suffix).str();
+}
+
+namespace {
+/// Returns the type of the entry array.
+llvm::ArrayType *getEntryArrayType(llvm::Module &module, size_t numElems) {
+  return llvm::ArrayType::get(llvm::offloading::getEntryTy(module), numElems);
+}
+
+/// Creates the initializer of the entry array.
+llvm::Constant *getEntryArrayBegin(llvm::Module &module,
+                                   ArrayRef<llvm::Constant *> entries) {
+  // If there are no entries return a constant zero initializer.
+  llvm::ArrayType *arrayTy = getEntryArrayType(module, entries.size());
+  return entries.empty() ? llvm::ConstantAggregateZero::get(arrayTy)
+                         : llvm::ConstantArray::get(arrayTy, entries);
+}
+
+/// Computes the end position of the entry array.
+llvm::Constant *getEntryArrayEnd(llvm::Module &module,
+                                 llvm::GlobalVariable *begin, size_t numElems) {
+  llvm::Type *intTy = module.getDataLayout().getIntPtrType(module.getContext());
+  return llvm::ConstantExpr::getGetElementPtr(
+      llvm::offloading::getEntryTy(module), begin,
+      ArrayRef<llvm::Constant *>({llvm::ConstantInt::get(intTy, numElems)}),
+      true);
+}
+} // namespace
+
+OffloadHandler::OffloadEntryArray
+OffloadHandler::getEntryArray(StringRef suffix) {
+  llvm::GlobalVariable *beginGV =
+      module.getGlobalVariable(getBeginSymbol(suffix), true);
+  llvm::GlobalVariable *endGV =
+      module.getGlobalVariable(getEndSymbol(suffix), true);
+  return {beginGV, endGV};
+}
+
+OffloadHandler::OffloadEntryArray
+OffloadHandler::emitEmptyEntryArray(StringRef suffix) {
+  llvm::ArrayType *arrayTy = getEntryArrayType(module, 0);
+  auto *beginGV = new llvm::GlobalVariable(
+      module, arrayTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayBegin(module, {}), getBeginSymbol(suffix));
+  auto *endGV = new llvm::GlobalVariable(
+      module, llvm::PointerType::get(module.getContext(), 0),
+      /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayEnd(module, beginGV, 0), getEndSymbol(suffix));
+  return {beginGV, endGV};
+}
+
+LogicalResult OffloadHandler::insertOffloadEntry(StringRef suffix,
+                                                 llvm::Constant *entry) {
+  // Get the begin and end symbols to the entry array.
+  std::string beginSymId = getBeginSymbol(suffix);
+  llvm::GlobalVariable *beginGV = module.getGlobalVariable(beginSymId, true);
+  llvm::GlobalVariable *endGV =
+      module.getGlobalVariable(getEndSymbol(suffix), true);
+  // Fail if the symbols are missing.
+  if (!beginGV || !endGV)
+    return failure();
+  // Create the entry initializer.
+  assert(beginGV->getInitializer() && "entry array initializer is missing.");
+  // Add existing entries into the new entry array.
+  SmallVector<llvm::Constant *> entries;
+  if (auto beginInit = dyn_cast_or_null<llvm::ConstantAggregate>(
+          beginGV->getInitializer())) {
+    for (unsigned i = 0; i < beginInit->getNumOperands(); ++i)
+      entries.push_back(beginInit->getOperand(i));
+  }
+  // Add the new entry.
+  entries.push_back(entry);
+  // Create a global holding the new updated set of entries.
+  auto *arrayTy = llvm::ArrayType::get(llvm::offloading::getEntryTy(module),
+                                       entries.size());
+  auto *entryArr = new llvm::GlobalVariable(
+      module, arrayTy, /*isConstant=*/true, llvm::GlobalValue::InternalLinkage,
+      getEntryArrayBegin(module, entries), beginSymId, endGV);
+  // Replace the old entry array variable withe new one.
+  beginGV->replaceAllUsesWith(entryArr);
+  beginGV->eraseFromParent();
+  entryArr->setName(beginSymId);
+  // Update the end symbol.
+  endGV->setInitializer(getEntryArrayEnd(module, entryArr, entries.size()));
+  return success();
+}
diff --git a/mlir/unittests/Target/LLVM/CMakeLists.txt b/mlir/unittests/Target/LLVM/CMakeLists.txt
index 6d612548a94c0f..d04f38ddddfacf 100644
--- a/mlir/unittests/Target/LLVM/CMakeLists.txt
+++ b/mlir/unittests/Target/LLVM/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_unittest(MLIRTargetLLVMTests
+  Offload.cpp
   SerializeNVVMTarget.cpp
   SerializeROCDLTarget.cpp
   SerializeToLLVMBitcode.cpp
diff --git a/mlir/unittests/Target/LLVM/Offload.cpp b/mlir/unittests/Target/LLVM/Offload.cpp
new file mode 100644
index 00000000000000..375edc2e9614d3
--- /dev/null
+++ b/mlir/unittests/Target/LLVM/Offload.cpp
@@ -0,0 +1,49 @@
+//===- Offload.cpp ----------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+
+#include "gmock/gmock.h"
+
+using namespace llvm;
+
+TEST(MLIRTarget, OffloadAPI) {
+  using OffloadEntryArray = mlir::LLVM::OffloadHandler::OffloadEntryArray;
+  LLVMContext llvmContext;
+  Module llvmModule("offload", llvmContext);
+  mlir::LLVM::OffloadHandler handler(llvmModule);
+  StringRef suffix = ".mlir";
+  // Check there's no entry array with `.mlir` suffix.
+  OffloadEntryArray entryArray = handler.getEntryArray(suffix);
+  EXPECT_EQ(entryArray, OffloadEntryArray());
+  // Emit the entry array.
+  handler.emitEmptyEntryArray(suffix);
+  // Check there's an entry array with `.mlir` suffix.
+  entryArray = handler.getEntryArray(suffix);
+  ASSERT_NE(entryArray.first, nullptr);
+  ASSERT_NE(entryArray.second, nullptr);
+  // Check the array contains no entries.
+  auto *zeroInitializer = dyn_cast_or_null<ConstantAggregateZero>(
+      entryArray.first->getInitializer());
+  ASSERT_NE(zeroInitializer, nullptr);
+  // Insert an empty entries.
+  auto emptyEntry =
+      ConstantAggregateZero::get(offloading::getEntryTy(llvmModule));
+  ASSERT_TRUE(succeeded(handler.insertOffloadEntry(suffix, emptyEntry)));
+  // Check there's an entry in the entry array with `.mlir` suffix.
+  entryArray = handler.getEntryArray(suffix);
+  ASSERT_NE(entryArray.first, nullptr);
+  Constant *arrayInitializer = entryArray.first->getInitializer();
+  ASSERT_NE(arrayInitializer, nullptr);
+  auto *arrayTy = dyn_cast_or_null<ArrayType>(arrayInitializer->getType());
+  ASSERT_NE(arrayTy, nullptr);
+  EXPECT_EQ(arrayTy->getNumElements(), 1u);
+}

>From 5c08090f05704ae1464fa2b09ce21fb4ddc9471d Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Mon, 15 Jan 2024 16:44:27 +0000
Subject: [PATCH 5/9] [mlir][OpenMP] Add the `tgt_entry_info` attribute

This patch adds the `omp.tgt_entry_info` attribute. This attribute provides
information to identify offload entries uniquely and partially reflects the
information in the `llvm::TargetRegionEntryInfo` struct.

An `info` parameter was added to `TargetOp` to specify the offload entry
information from the operation explicitly. Both the host and device versions
must have the same `info` attribute; otherwise, the constructs won't correctly
map between each other.

This patch is required to enable JIT compilation for the OMP dialect, as
the entry array has to be fully constructed in the IR instead of using
sections to implicitly construct it.

Note: Ignore the base commits.
---
 flang/lib/Lower/OpenMP.cpp                    |  2 +-
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 34 ++++++++++-
 .../LLVMIR/Dialect/OpenMP/CMakeLists.txt      |  1 +
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 61 ++++++++++++++++++-
 .../Target/LLVMIR/omptarget-entry-info.mlir   | 48 +++++++++++++++
 5 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-entry-info.mlir

diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 4f7c99a6d2b840..2944187aaef7aa 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -2802,7 +2802,7 @@ genTargetOp(Fortran::lower::AbstractConverter &converter,
 
   auto targetOp = converter.getFirOpBuilder().create<mlir::omp::TargetOp>(
       currentLocation, ifClauseOperand, deviceOperand, threadLimitOperand,
-      nowaitAttr, mapOperands);
+      nowaitAttr, mapOperands, mlir::omp::TargetRegionEntryInfoAttr());
 
   genBodyOfTargetOp(converter, eval, targetOp, mapSymTypes, mapSymLocs,
                     mapSymbols, currentLocation);
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index d614f2666a85ab..89fea898497be4 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -81,6 +81,33 @@ def TargetAttr : OpenMP_Attr<"Target", "target"> {
   let assemblyFormat = "`<` struct(params) `>`";
 }
 
+def TargetRegionEntryInfoAttr :
+    OpenMP_Attr<"TargetRegionEntryInfo", "tgt_entry_info"> {
+  let description = [{
+    The `tgt_entry_info` attribute provides information to identify offload
+    entries uniquely. This attribute partially reflects the information in the
+    `llvm::TargetRegionEntryInfo` struct.
+
+    The information in this attribute can be used to generate the unique
+    identifier used to refer the offload symbol.
+
+    The optional `section` parameter can be used to emit the entry in an specific
+    entry rather than using the `omp_offloading_entries` data section. This
+    array has to be created before translating the Op generating the entry.
+
+    Example:
+    ```mlir
+    omp.tgt_entry_info<deviceID = 1, fileID = 0, line = 1, section = @entryArray>
+    ```
+  }];
+  let parameters = (ins
+    "unsigned":$deviceID,
+    "unsigned":$fileID,
+    "unsigned":$line,
+    OptionalParameter<"FlatSymbolRefAttr">:$section
+  );
+  let assemblyFormat = "`<` struct(params) `>`";
+}
 
 class OpenMP_Op<string mnemonic, list<Trait> traits = []> :
       Op<OpenMP_Dialect, mnemonic, traits>;
@@ -1443,6 +1470,9 @@ def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, OutlineableOpenMPOpInterfa
     The optional $nowait elliminates the implicit barrier so the parent task can make progress
     even if the target task is not yet completed.
 
+    The optional $info paremeter specifies the information that should be used to
+    create the offload entry in the IR.
+
     TODO:  is_device_ptr, depend, defaultmap, in_reduction
 
   }];
@@ -1451,7 +1481,8 @@ def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, OutlineableOpenMPOpInterfa
                        Optional<AnyInteger>:$device,
                        Optional<AnyInteger>:$thread_limit,
                        UnitAttr:$nowait,
-                       Variadic<AnyType>:$map_operands);
+                       Variadic<AnyType>:$map_operands,
+                       OptionalAttr<TargetRegionEntryInfoAttr>:$targetRegionEntryInfo);
 
   let regions = (region AnyRegion:$region);
 
@@ -1461,6 +1492,7 @@ def TargetOp : OpenMP_Op<"target",[IsolatedFromAbove, OutlineableOpenMPOpInterfa
     | `thread_limit` `(` $thread_limit `:` type($thread_limit) `)`
     | `nowait` $nowait
     | `map_entries` `(` custom<MapEntries>($map_operands, type($map_operands)) `)`
+    | `info` `=` qualified($targetRegionEntryInfo)
     ) $region attr-dict
   }];
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt
index 0a5d7c6e22058d..744ec7b1ae3cf4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/CMakeLists.txt
@@ -9,6 +9,7 @@ add_mlir_translation_library(MLIROpenMPToLLVMIRTranslation
   MLIRLLVMDialect
   MLIROpenMPDialect
   MLIRSupport
+  MLIRTargetLLVM
   MLIRTargetLLVMIRExport
   MLIRTransformUtils
   )
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index e7aebc3ce4be56..5492d828a99a3f 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -18,12 +18,14 @@
 #include "mlir/IR/Operation.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
+#include "mlir/Target/LLVM/Offload.h"
 #include "mlir/Target/LLVMIR/Dialect/OpenMPCommon.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 #include "mlir/Transforms/RegionUtils.h"
 
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/DebugInfoMetadata.h"
@@ -2065,6 +2067,25 @@ LogicalResult convertFlagsAttr(Operation *op, mlir::omp::FlagsAttr attribute,
   return success();
 }
 
+static bool
+getAndRegisterTargetEntryUniqueInfo(llvm::TargetRegionEntryInfo &targetInfo,
+                                    LLVM::ModuleTranslation &moduleTranslation,
+                                    omp::TargetOp targetOp,
+                                    llvm::StringRef parentName = "") {
+  omp::TargetRegionEntryInfoAttr infoAttr =
+      targetOp.getTargetRegionEntryInfoAttr();
+  if (!infoAttr)
+    return false;
+  targetInfo =
+      llvm::TargetRegionEntryInfo(parentName, infoAttr.getDeviceID(),
+                                  infoAttr.getFileID(), infoAttr.getLine());
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  if (ompBuilder->Config.isTargetDevice() && ompBuilder->Config.isGPU())
+    ompBuilder->OffloadInfoManager.initializeTargetRegionEntryInfo(targetInfo,
+                                                                   0);
+  return true;
+}
+
 static bool getTargetEntryUniqueInfo(llvm::TargetRegionEntryInfo &targetInfo,
                                      omp::TargetOp targetOp,
                                      llvm::StringRef parentName = "") {
@@ -2371,7 +2392,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
 
   llvm::TargetRegionEntryInfo entryInfo;
 
-  if (!getTargetEntryUniqueInfo(entryInfo, targetOp, parentName))
+  if (!(getAndRegisterTargetEntryUniqueInfo(entryInfo, moduleTranslation,
+                                            targetOp, parentName) ||
+        getTargetEntryUniqueInfo(entryInfo, targetOp, parentName)))
     return failure();
 
   int32_t defaultValTeams = -1;
@@ -2434,15 +2457,49 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
       kernelInput.push_back(mapData.OriginalValue[i]);
   }
 
+  llvm::OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion entryRegionInfo;
+  // Determine whether the entry is going to be handled by
+  // `OffloadEntriesInfoManager` or by this method. If `entryArraySection` is
+  // null then it's handled by `OffloadEntriesInfoManager`
+  omp::TargetRegionEntryInfoAttr regionInfoAttr =
+      targetOp.getTargetRegionEntryInfoAttr();
+  FlatSymbolRefAttr entryArraySection =
+      regionInfoAttr ? regionInfoAttr.getSection() : FlatSymbolRefAttr();
+
+  // Create the target region
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
-      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB));
+      defaultValThreads, kernelInput, genMapInfoCB, bodyCB, argAccessorCB,
+      entryArraySection ? &entryRegionInfo : nullptr));
 
   // Remap access operations to declare target reference pointers for the
   // device, essentially generating extra loadop's as necessary
   if (moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice())
     handleDeclareTargetMapVar(mapData, moduleTranslation, builder);
 
+  // Return early if the target op it's being emitted for a device or if the
+  // entry is handled by `OffloadEntriesInfoManager`
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+  if (ompBuilder->Config.isTargetDevice() || !entryArraySection)
+    return bodyGenStatus;
+
+  assert(entryRegionInfo.isValid() && "invalid target entry region");
+
+  auto regionAddrGV =
+      dyn_cast_or_null<llvm::GlobalValue>(entryRegionInfo.getAddress());
+  assert(regionAddrGV && "missing reggion address");
+
+  // Emit the offload entry.
+  llvm::Module &llvmModule = *moduleTranslation.getLLVMModule();
+  LLVM::OffloadHandler offloadHandler(llvmModule);
+  std::pair<llvm::Constant *, llvm::GlobalVariable *> entryInit =
+      llvm::offloading::getOffloadingEntryInitializer(
+          llvmModule, entryRegionInfo.getID(), regionAddrGV->getName(), 0,
+          entryRegionInfo.getFlags(), 0);
+
+  if (failed(offloadHandler.insertOffloadEntry(entryArraySection.getValue(),
+                                               entryInit.first)))
+    targetOp.emitError("failed to insert the entry");
   return bodyGenStatus;
 }
 
diff --git a/mlir/test/Target/LLVMIR/omptarget-entry-info.mlir b/mlir/test/Target/LLVMIR/omptarget-entry-info.mlir
new file mode 100644
index 00000000000000..dafae687022291
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-entry-info.mlir
@@ -0,0 +1,48 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+!EntryArray = !llvm.struct<(!llvm.ptr, !llvm.ptr, i64, i32, i32)>
+// CHECK: @__begin_offload_omp = internal constant [2 x %{{.*}}] [%{{.*}} { ptr @[[TGT_OP_1:.*]], ptr @[[TGT_OP_1_NAME:.*]], i64 0, i32 0, i32 0 }, %{{.*}} { ptr @[[TGT_OP_2:.*]], ptr @[[TGT_OP_2_NAME:.*]], i64 0, i32 0, i32 0 }]
+// CHECK: @__end_offload_omp = constant ptr getelementptr inbounds (%{{.*}}, ptr @__begin_offload_omp, i64 2)
+// CHECK: @[[TGT_OP_1]] = weak constant i8 0
+// CHECK: @[[TGT_OP_1_NAME]] = internal unnamed_addr constant [{{.*}} x i8] c"{{.*}}0_0_main_l0\00"
+// CHECK: @[[TGT_OP_2]] = weak constant i8 0
+// CHECK: @[[TGT_OP_2_NAME]] = internal unnamed_addr constant [{{.*}} x i8] c"{{.*}}0_0_main_l1\00"
+// CHECK: define void @main() {
+// CHECK: %{{.*}} = call i32 @__tgt_target_kernel(ptr @{{.*}}, i64 -1, i32 -1, i32 0, ptr @[[TGT_OP_1]], ptr %{{.*}})
+// CHECK: %{{.*}} = call i32 @__tgt_target_kernel(ptr @{{.*}}, i64 -1, i32 -1, i32 0, ptr @[[TGT_OP_2]], ptr %{{.*}})
+// CHECK: }
+// CHECK-LABEL: define internal void @{{.*}}0_0_main_l0() {
+// CHECK-LABEL: define internal void @{{.*}}0_0_main_l1() {
+module attributes {omp.is_target_device = false, omp.is_gpu = false} {
+  llvm.mlir.global constant @__begin_offload_omp() : !llvm.array<0 x !EntryArray> {
+    %zero = llvm.mlir.zero : !llvm.array<0 x !EntryArray>
+    llvm.return %zero : !llvm.array<0 x !EntryArray>
+  }
+  llvm.mlir.global constant @__end_offload_omp() : !llvm.ptr {
+    %array = llvm.mlir.addressof @__begin_offload_omp : !llvm.ptr
+    llvm.return %array : !llvm.ptr
+  }
+  llvm.func @main() {
+    omp.target info = #omp.tgt_entry_info<deviceID = 0, fileID = 0, line = 0, section = @omp> {
+      omp.terminator
+    }
+    omp.target info = #omp.tgt_entry_info<deviceID = 0, fileID = 0, line = 1, section = @omp> {
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// -----
+
+// CHECK: @[[TGT_OP:.*]] = weak constant i8 0
+// CHECK: @[[TGT_OP_NAME:.*]] = internal unnamed_addr constant [{{.*}} x i8] c"{{.*}}0_0_main_l0\00"
+// CHECK: @{{.*}} = weak constant %{{.*}} { ptr @[[TGT_OP]], ptr @[[TGT_OP_NAME]], i64 0, i32 0, i32 0 }, section "omp_offloading_entries", align 1
+module attributes {omp.is_target_device = false, omp.is_gpu = false} {
+  llvm.func @main() {
+    omp.target info = #omp.tgt_entry_info<deviceID = 0, fileID = 0, line = 0> {
+      omp.terminator
+    }
+    llvm.return
+  }
+}

>From 11dbc670581a414dbeee2ff9a45dc8c2491b585c Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Tue, 16 Jan 2024 01:46:37 +0000
Subject: [PATCH 6/9] [mlir][OpenMP] Remove unnecessary dialect dependencies

This patch removes dialect dependencies from the OpenMP dialect; all external
models were moved to independent libraries to accomplish this change.
Consequently, OpenMP dialect users can pull only the desired external models;
for example, a user might choose not to include OpenMP Func external models.
However, all external models still reside inside the OpenMP dialect, as it's
the owner of those interfaces.

Additionally, an external model was included for the GPUModule operation.
---
 .../include/flang/Optimizer/Support/InitFIR.h |  4 ++
 flang/lib/Optimizer/Transforms/CMakeLists.txt |  3 +
 .../mlir/Dialect/OpenMP/ExternalModels.h      | 37 +++++++++++
 .../mlir/Dialect/OpenMP/OpenMPDialect.h       |  1 -
 .../mlir/Dialect/OpenMP/OpenMPInterfaces.h    |  5 +-
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  2 -
 mlir/include/mlir/InitAllDialects.h           |  2 +
 mlir/include/mlir/Target/LLVMIR/Dialect/All.h |  3 +
 mlir/lib/Dialect/OpenMP/CMakeLists.txt        | 62 ++++++++++++++++++-
 .../ExternalModels/BuiltinExternalModels.cpp  | 35 +++++++++++
 .../ExternalModels/FuncExternalModels.cpp     | 31 ++++++++++
 .../ExternalModels/GPUExternalModels.cpp      | 24 +++++++
 .../ExternalModels/LLVMExternalModels.cpp     | 42 +++++++++++++
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  | 43 -------------
 mlir/lib/Target/LLVMIR/CMakeLists.txt         |  4 ++
 15 files changed, 248 insertions(+), 50 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/ExternalModels.h
 create mode 100644 mlir/lib/Dialect/OpenMP/ExternalModels/BuiltinExternalModels.cpp
 create mode 100644 mlir/lib/Dialect/OpenMP/ExternalModels/FuncExternalModels.cpp
 create mode 100644 mlir/lib/Dialect/OpenMP/ExternalModels/GPUExternalModels.cpp
 create mode 100644 mlir/lib/Dialect/OpenMP/ExternalModels/LLVMExternalModels.cpp

diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
index 8c47ad3d9f4451..76dbf39a1b6e27 100644
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -19,6 +19,7 @@
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Func/Extensions/InlinerExtension.h"
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
 #include "mlir/InitAllDialects.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassRegistry.h"
@@ -44,6 +45,9 @@ namespace fir::support {
 inline void registerNonCodegenDialects(mlir::DialectRegistry &registry) {
   registry.insert<FLANG_NONCODEGEN_DIALECT_LIST>();
   mlir::func::registerInlinerExtension(registry);
+  mlir::omp::registerBuiltinExternalModels(registry);
+  mlir::omp::registerFuncExternalModels(registry);
+  mlir::omp::registerLLVMExternalModels(registry);
 }
 
 /// Register all the dialects used by flang.
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index fc067ad3585395..612a20453a9377 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -43,4 +43,7 @@ add_flang_library(FIRTransforms
   MLIROpenACCDialect
   MLIROpenACCToLLVMIRTranslation
   MLIROpenMPDialect
+  MLIROpenMPBuiltinExternalModels
+  MLIROpenMPFuncExternalModels
+  MLIROpenMPLLVMExternalModels
 )
diff --git a/mlir/include/mlir/Dialect/OpenMP/ExternalModels.h b/mlir/include/mlir/Dialect/OpenMP/ExternalModels.h
new file mode 100644
index 00000000000000..9c3194982eb86f
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/ExternalModels.h
@@ -0,0 +1,37 @@
+//===- ExternalModels.h - External models owned by the OMP dialect --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the OpenMP external models for other dialects.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_EXTERNALMODELS_H
+#define MLIR_DIALECT_OPENMP_EXTERNALMODELS_H
+
+namespace mlir {
+class DialectRegistry;
+namespace omp {
+/// Register OMP external models for the Builtin dialect.
+void registerBuiltinExternalModels(DialectRegistry &registry);
+/// Register OMP external models for the Func dialect.
+void registerFuncExternalModels(DialectRegistry &registry);
+/// Register OMP external models for the GPU dialect.
+void registerGPUExternalModels(DialectRegistry &registry);
+/// Register OMP external models for the LLVM dialect.
+void registerLLVMExternalModels(DialectRegistry &registry);
+/// Register all OMP external models.
+inline void registerAllExternalModels(DialectRegistry &registry) {
+  registerBuiltinExternalModels(registry);
+  registerFuncExternalModels(registry);
+  registerGPUExternalModels(registry);
+  registerLLVMExternalModels(registry);
+}
+} // namespace omp
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENMP_EXTERNALMODELS_H
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
index 23509c5b607016..e6786dfaf2b57f 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPDialect.h
@@ -13,7 +13,6 @@
 #ifndef MLIR_DIALECT_OPENMP_OPENMPDIALECT_H_
 #define MLIR_DIALECT_OPENMP_OPENMPDIALECT_H_
 
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h
index d78c541252a98d..9b3e23f01ca868 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPInterfaces.h
@@ -27,9 +27,10 @@ namespace mlir::omp {
 // You can override defaults here or implement more complex implementations of
 // functions. Or define a completely seperate external model implementation,
 // to override the existing implementation.
+template <typename T>
 struct OffloadModuleDefaultModel
-    : public OffloadModuleInterface::ExternalModel<OffloadModuleDefaultModel,
-                                                   mlir::ModuleOp> {};
+    : public OffloadModuleInterface::ExternalModel<OffloadModuleDefaultModel<T>,
+                                                   T> {};
 
 template <typename T>
 struct DeclareTargetDefaultModel
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index d614f2666a85ab..0703368c141e8a 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -19,7 +19,6 @@ include "mlir/IR/OpBase.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/IR/SymbolInterfaces.td"
-include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
 include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.td"
 include "mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td"
 include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.td"
@@ -27,7 +26,6 @@ include "mlir/Dialect/OpenMP/OpenMPTypeInterfaces.td"
 def OpenMP_Dialect : Dialect {
   let name = "omp";
   let cppNamespace = "::mlir::omp";
-  let dependentDialects = ["::mlir::LLVM::LLVMDialect, ::mlir::func::FuncDialect"];
   let useDefaultAttributePrinterParser = 1;
   let useDefaultTypePrinterParser = 1;
 }
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
index 19a62cadaa2e04..681ab02ed2c20d 100644
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -57,6 +57,7 @@
 #include "mlir/Dialect/Mesh/IR/MeshOps.h"
 #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h"
 #include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/Dialect/PDL/IR/PDL.h"
 #include "mlir/Dialect/PDLInterp/IR/PDLInterp.h"
@@ -160,6 +161,7 @@ inline void registerAllDialects(DialectRegistry &registry) {
   memref::registerRuntimeVerifiableOpInterfaceExternalModels(registry);
   memref::registerValueBoundsOpInterfaceExternalModels(registry);
   memref::registerMemorySlotExternalModels(registry);
+  omp::registerAllExternalModels(registry);
   scf::registerBufferDeallocationOpInterfaceExternalModels(registry);
   scf::registerBufferizableOpInterfaceExternalModels(registry);
   scf::registerValueBoundsOpInterfaceExternalModels(registry);
diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 0b37e23e45118b..7e76b29bb5977e 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -14,6 +14,7 @@
 #ifndef MLIR_TARGET_LLVMIR_DIALECT_ALL_H
 #define MLIR_TARGET_LLVMIR_DIALECT_ALL_H
 
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
 #include "mlir/Target/LLVMIR/Dialect/AMX/AMXToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/ArmNeon/ArmNeonToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/ArmSME/ArmSMEToLLVMIRTranslation.h"
@@ -52,6 +53,8 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
+  // Extensions required for translating the OpenMP dialect.
+  omp::registerAllExternalModels(registry);
 }
 
 /// Registers all the translations to LLVM IR required by GPU passes.
diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index 40b4837484a136..a1af7e40dcc6b6 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -11,7 +11,65 @@ add_mlir_dialect_library(MLIROpenMPDialect
 
   LINK_LIBS PUBLIC
   MLIRIR
-  MLIRLLVMDialect
-  MLIRFuncDialect
+  MLIRControlFlowInterfaces
   MLIROpenACCMPCommon
   )
+
+add_mlir_dialect_library(MLIROpenMPBuiltinExternalModels
+  ExternalModels/BuiltinExternalModels.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
+
+  DEPENDS
+  MLIROpenMPOpsInterfacesIncGen
+  MLIROpenMPTypeInterfacesIncGen
+
+  LINK_LIBS PUBLIC
+  MLIROpenMPDialect
+)
+
+add_mlir_dialect_library(MLIROpenMPFuncExternalModels
+  ExternalModels/FuncExternalModels.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
+
+  DEPENDS
+  MLIROpenMPOpsInterfacesIncGen
+  MLIROpenMPTypeInterfacesIncGen
+
+  LINK_LIBS PUBLIC
+  MLIROpenMPDialect
+  MLIRFuncDialect
+)
+
+add_mlir_dialect_library(MLIROpenMPGPUExternalModels
+  ExternalModels/GPUExternalModels.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
+
+  DEPENDS
+  MLIROpenMPOpsInterfacesIncGen
+  MLIROpenMPTypeInterfacesIncGen
+
+  LINK_LIBS PUBLIC
+  MLIROpenMPDialect
+  MLIRGPUDialect
+)
+
+add_mlir_dialect_library(MLIROpenMPLLVMExternalModels
+  ExternalModels/LLVMExternalModels.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
+
+  DEPENDS
+  MLIROpenMPOpsInterfacesIncGen
+  MLIROpenMPTypeInterfacesIncGen
+
+  LINK_LIBS PUBLIC
+  MLIROpenMPDialect
+  MLIRLLVMDialect
+)
diff --git a/mlir/lib/Dialect/OpenMP/ExternalModels/BuiltinExternalModels.cpp b/mlir/lib/Dialect/OpenMP/ExternalModels/BuiltinExternalModels.cpp
new file mode 100644
index 00000000000000..e44b721ad39304
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/ExternalModels/BuiltinExternalModels.cpp
@@ -0,0 +1,35 @@
+//===- BuiltinExternalModels.cpp - Impl of Builtin external models --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP external models for the Builtin dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/BuiltinDialect.h"
+
+using namespace mlir;
+
+namespace {
+struct MemRefPointerLikeModel
+    : public omp::PointerLikeType::ExternalModel<MemRefPointerLikeModel,
+                                                 MemRefType> {
+  Type getElementType(Type pointer) const {
+    return llvm::cast<MemRefType>(pointer).getElementType();
+  }
+};
+} // namespace
+
+void omp::registerBuiltinExternalModels(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, BuiltinDialect *dialect) {
+    MemRefType::attachInterface<MemRefPointerLikeModel>(*ctx);
+    mlir::ModuleOp::attachInterface<
+        mlir::omp::OffloadModuleDefaultModel<mlir::ModuleOp>>(*ctx);
+  });
+}
diff --git a/mlir/lib/Dialect/OpenMP/ExternalModels/FuncExternalModels.cpp b/mlir/lib/Dialect/OpenMP/ExternalModels/FuncExternalModels.cpp
new file mode 100644
index 00000000000000..761a304b54778e
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/ExternalModels/FuncExternalModels.cpp
@@ -0,0 +1,31 @@
+//===- FuncExternalModels.cpp - Implementation of Func external models ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP external models for the Func dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+
+using namespace mlir;
+
+void omp::registerFuncExternalModels(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, func::FuncDialect *dialect) {
+    // Attach default declare target interfaces to operations which can be
+    // marked as declare target (Global Operations and Functions/Subroutines in
+    // dialects that Fortran (or other languages that lower to MLIR) translates
+    // too
+    mlir::func::FuncOp::attachInterface<
+        mlir::omp::DeclareTargetDefaultModel<mlir::func::FuncOp>>(*ctx);
+    // Attach default early outlining interface to func ops.
+    mlir::func::FuncOp::attachInterface<
+        mlir::omp::EarlyOutliningDefaultModel<mlir::func::FuncOp>>(*ctx);
+  });
+}
diff --git a/mlir/lib/Dialect/OpenMP/ExternalModels/GPUExternalModels.cpp b/mlir/lib/Dialect/OpenMP/ExternalModels/GPUExternalModels.cpp
new file mode 100644
index 00000000000000..962aeafefc13d4
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/ExternalModels/GPUExternalModels.cpp
@@ -0,0 +1,24 @@
+//===- GPUExternalModels.cpp - Implementation of GPU external models ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP external models for the GPU dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+
+using namespace mlir;
+
+void omp::registerGPUExternalModels(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
+    gpu::GPUModuleOp::attachInterface<
+        omp::OffloadModuleDefaultModel<gpu::GPUModuleOp>>(*ctx);
+  });
+}
diff --git a/mlir/lib/Dialect/OpenMP/ExternalModels/LLVMExternalModels.cpp b/mlir/lib/Dialect/OpenMP/ExternalModels/LLVMExternalModels.cpp
new file mode 100644
index 00000000000000..9c2cc5caa3e692
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/ExternalModels/LLVMExternalModels.cpp
@@ -0,0 +1,42 @@
+//===- LLVMExternalModels.cpp - Implementation of LLVM external models ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the OpenMP external models for the LLVM dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/OpenMP/ExternalModels.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+
+using namespace mlir;
+
+namespace {
+struct LLVMPointerPointerLikeModel
+    : public omp::PointerLikeType::ExternalModel<LLVMPointerPointerLikeModel,
+                                                 LLVM::LLVMPointerType> {
+  Type getElementType(Type pointer) const { return Type(); }
+};
+} // namespace
+
+void omp::registerLLVMExternalModels(DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, LLVM::LLVMDialect *dialect) {
+    LLVM::LLVMPointerType::attachInterface<LLVMPointerPointerLikeModel>(*ctx);
+    // Attach default declare target interfaces to operations which can be
+    // marked as declare target (Global Operations and Functions/Subroutines in
+    // dialects that Fortran (or other languages that lower to MLIR) translates
+    // too
+    mlir::LLVM::GlobalOp::attachInterface<
+        mlir::omp::DeclareTargetDefaultModel<mlir::LLVM::GlobalOp>>(*ctx);
+    mlir::LLVM::LLVMFuncOp::attachInterface<
+        mlir::omp::DeclareTargetDefaultModel<mlir::LLVM::LLVMFuncOp>>(*ctx);
+    // Attach default early outlining interface to func ops.
+    mlir::LLVM::LLVMFuncOp::attachInterface<
+        mlir::omp::EarlyOutliningDefaultModel<mlir::LLVM::LLVMFuncOp>>(*ctx);
+  });
+}
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 6e69cd0d386bd2..b1877636f4c57f 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -11,8 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/DialectImplementation.h"
@@ -39,20 +37,6 @@ using namespace mlir;
 using namespace mlir::omp;
 
 namespace {
-struct MemRefPointerLikeModel
-    : public PointerLikeType::ExternalModel<MemRefPointerLikeModel,
-                                            MemRefType> {
-  Type getElementType(Type pointer) const {
-    return llvm::cast<MemRefType>(pointer).getElementType();
-  }
-};
-
-struct LLVMPointerPointerLikeModel
-    : public PointerLikeType::ExternalModel<LLVMPointerPointerLikeModel,
-                                            LLVM::LLVMPointerType> {
-  Type getElementType(Type pointer) const { return Type(); }
-};
-
 struct OpenMPDialectFoldInterface : public DialectFoldInterface {
   using DialectFoldInterface::DialectFoldInterface;
 
@@ -78,33 +62,6 @@ void OpenMPDialect::initialize() {
       >();
 
   addInterface<OpenMPDialectFoldInterface>();
-  MemRefType::attachInterface<MemRefPointerLikeModel>(*getContext());
-  LLVM::LLVMPointerType::attachInterface<LLVMPointerPointerLikeModel>(
-      *getContext());
-
-  // Attach default offload module interface to module op to access
-  // offload functionality through
-  mlir::ModuleOp::attachInterface<mlir::omp::OffloadModuleDefaultModel>(
-      *getContext());
-
-  // Attach default declare target interfaces to operations which can be marked
-  // as declare target (Global Operations and Functions/Subroutines in dialects
-  // that Fortran (or other languages that lower to MLIR) translates too
-  mlir::LLVM::GlobalOp::attachInterface<
-      mlir::omp::DeclareTargetDefaultModel<mlir::LLVM::GlobalOp>>(
-      *getContext());
-  mlir::LLVM::LLVMFuncOp::attachInterface<
-      mlir::omp::DeclareTargetDefaultModel<mlir::LLVM::LLVMFuncOp>>(
-      *getContext());
-  mlir::func::FuncOp::attachInterface<
-      mlir::omp::DeclareTargetDefaultModel<mlir::func::FuncOp>>(*getContext());
-
-  // Attach default early outlining interface to func ops.
-  mlir::func::FuncOp::attachInterface<
-      mlir::omp::EarlyOutliningDefaultModel<mlir::func::FuncOp>>(*getContext());
-  mlir::LLVM::LLVMFuncOp::attachInterface<
-      mlir::omp::EarlyOutliningDefaultModel<mlir::LLVM::LLVMFuncOp>>(
-      *getContext());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt
index 94280a2ec9012b..072a597eb52e72 100644
--- a/mlir/lib/Target/LLVMIR/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt
@@ -57,6 +57,10 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
   MLIRNVVMToLLVMIRTranslation
   MLIROpenACCToLLVMIRTranslation
   MLIROpenMPToLLVMIRTranslation
+  MLIROpenMPBuiltinExternalModels
+  MLIROpenMPFuncExternalModels
+  MLIROpenMPGPUExternalModels
+  MLIROpenMPLLVMExternalModels
   MLIRROCDLToLLVMIRTranslation
   MLIRSPIRVToLLVMIRTranslation
   )

>From fe36b64884ae761c87b09d677255df46145c46e6 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Tue, 16 Jan 2024 02:25:29 +0000
Subject: [PATCH 7/9] [mlir][gpu] Add the OffloadEmbeddingAttr offloading
 translation attr

This patch adds the offloading translation attribute. This attribute uses LLVM
offloading infrastructure to embed GPU binaries in the IR. At the program start,
the LLVM offloading mechanism registers kernels and variables with the runtime
library: CUDA RT, HIP RT, or LibOMPTarget.

The offloading mechanism relies on the runtime library to dispatch the correct
kernel based on the registered symbols.

This patch is 3/4 on introducing the OffloadEmbeddingAttr GPU translation
attribute.

Note: Ignore the base commits; those are being reviewed in PRs #78057, #78098,
and #78073.
---
 .../mlir/Dialect/GPU/IR/CompilationAttrs.td   |  35 ++
 .../Target/LLVMIR/Dialect/GPU/CMakeLists.txt  |   5 +-
 ...ttr.cpp => OffloadingTranslationAttrs.cpp} | 434 +++++++++++++++---
 mlir/test/Target/LLVMIR/gpu.mlir              |  83 ++++
 4 files changed, 498 insertions(+), 59 deletions(-)
 rename mlir/lib/Target/LLVMIR/Dialect/GPU/{SelectObjectAttr.cpp => OffloadingTranslationAttrs.cpp} (54%)

diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 6659f4a2c58e82..812b72681343b9 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -107,4 +107,39 @@ def GPU_SelectObjectAttr : GPU_Attr<"SelectObject", "select_object", [
   let genVerifyDecl = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// GPU LLVM offload attribute.
+//===----------------------------------------------------------------------===//
+def GPU_OffloadOpenMP : I32EnumAttrCase<"OpenMP", 1, "omp">;
+def GPU_OffloadCUDA : I32EnumAttrCase<"CUDA", 2, "cuda">;
+def GPU_OffloadHIP : I32EnumAttrCase<"HIP", 3, "hip">;
+def GPU_OffloadKindEnum : GPU_I32Enum<
+  "OffloadKind", "GPU offload kind", [
+    GPU_OffloadOpenMP,
+    GPU_OffloadCUDA,
+    GPU_OffloadHIP
+  ]>;
+
+def GPU_OffloadEmbeddingAttr : GPU_Attr<"OffloadEmbedding", "offload_embedding", [
+    OffloadingTranslationAttrTrait
+  ]> {
+  let description = [{
+    This GPU offloading handler uses LLVM offloading infrastructure to embed GPU
+    binaries in the IR. At program start, the LLVM offloading mechanism registers
+    kernels and variables with the runtime library: CUDA RT, HIP RT or
+    LibOMPTarget.
+    The offloading mechanism relies on the runtime library to dispatch the
+    correct kernel based on the registered symbols.
+    This offload mechanism requires to specify which runtime is being called,
+    this is done by the `kind` parameter.
+    Example:
+    ```mlir
+    gpu.binary @binary <#gpu.offload_embedding<omp>> [...]
+    gpu.binary @binary <#gpu.offload_embedding<cuda>> [...]
+    ```
+  }];
+  let parameters = (ins "gpu::OffloadKind":$kind);
+  let assemblyFormat = [{ `<` $kind `>` }];
+}
+
 #endif // GPU_COMPILATION_ATTRS
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
index 11816ff5c2c1f1..b95b1e95a039ba 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
@@ -1,14 +1,17 @@
 add_mlir_translation_library(MLIRGPUToLLVMIRTranslation
   GPUToLLVMIRTranslation.cpp
-  SelectObjectAttr.cpp
+  OffloadingTranslationAttrs.cpp
 
   LINK_COMPONENTS
   Core
+  FrontendOffloading
+  Object
 
   LINK_LIBS PUBLIC
   MLIRIR
   MLIRGPUDialect
   MLIRLLVMDialect
   MLIRSupport
+  MLIRTargetLLVM
   MLIRTargetLLVMIRExport
   )
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/OffloadingTranslationAttrs.cpp
similarity index 54%
rename from mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
rename to mlir/lib/Target/LLVMIR/Dialect/GPU/OffloadingTranslationAttrs.cpp
index 0eb33287d608bd..4448b72615e21d 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/OffloadingTranslationAttrs.cpp
@@ -25,6 +25,9 @@
 
 using namespace mlir;
 
+//===----------------------------------------------------------------------===//
+// SelectObjectAttr
+//===----------------------------------------------------------------------===//
 namespace {
 // Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
 class SelectObjectAttrImpl
@@ -54,13 +57,6 @@ std::string getBinaryIdentifier(StringRef binaryName) {
 }
 } // namespace
 
-void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
-    DialectRegistry &registry) {
-  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
-    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
-  });
-}
-
 gpu::ObjectAttr
 SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const {
   ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
@@ -136,6 +132,9 @@ class LaunchKernel {
   // Get the kernel launch callee.
   FunctionCallee getKernelLaunchFn();
 
+  // Get the kernel RT launch callee.
+  FunctionCallee getKernelRTLaunchFn();
+
   // Get the kernel launch callee.
   FunctionCallee getClusterKernelLaunchFn();
 
@@ -166,9 +165,15 @@ class LaunchKernel {
   // Create the void* kernel array for passing the arguments.
   Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
 
+  // Returns a pair containing the function pointer to the kernel and the
+  // pointer to the kernel module.
+  mlir::FailureOr<std::pair<Value *, Value *>>
+  getKernelInfo(mlir::gpu::LaunchFuncOp op, mlir::gpu::ObjectAttr object);
+
   // Create the full kernel launch.
   mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op,
-                                         mlir::gpu::ObjectAttr object);
+                                         mlir::gpu::ObjectAttr object,
+                                         Value *kernelPtr = nullptr);
 
 private:
   Module &module;
@@ -244,6 +249,16 @@ llvm::FunctionCallee llvm::LaunchKernel::getClusterKernelLaunchFn() {
           false));
 }
 
+llvm::FunctionCallee llvm::LaunchKernel::getKernelRTLaunchFn() {
+  return module.getOrInsertFunction(
+      "mgpuLaunchKernelRT",
+      FunctionType::get(voidTy,
+                        ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
+                                          intPtrTy, intPtrTy, intPtrTy, i32Ty,
+                                          ptrTy, ptrTy, ptrTy, i64Ty}),
+                        false));
+}
+
 llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
   return module.getOrInsertFunction(
       "mgpuModuleGetFunction",
@@ -334,46 +349,14 @@ llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) {
   return argArray;
 }
 
-// Emits LLVM IR to launch a kernel function:
+// Loads the kernel module pointer
 // %0 = call %binarygetter
 // %1 = call %moduleLoad(%0)
 // %2 = <see generateKernelNameConstant>
 // %3 = call %moduleGetFunction(%1, %2)
-// %4 = call %streamCreate()
-// %5 = <see generateParamsArray>
-// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
-// call %streamSynchronize(%4)
-// call %streamDestroy(%4)
-// call %moduleUnload(%1)
-mlir::LogicalResult
-llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
-                                       mlir::gpu::ObjectAttr object) {
-  auto llvmValue = [&](mlir::Value value) -> Value * {
-    Value *v = moduleTranslation.lookupValue(value);
-    assert(v && "Value has not been translated.");
-    return v;
-  };
-
-  // Get grid dimensions.
-  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
-  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
-        *gz = llvmValue(grid.z);
-
-  // Get block dimensions.
-  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
-  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
-        *bz = llvmValue(block.z);
-
-  // Get dynamic shared memory size.
-  Value *dynamicMemorySize = nullptr;
-  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
-    dynamicMemorySize = llvmValue(dynSz);
-  else
-    dynamicMemorySize = ConstantInt::get(i32Ty, 0);
-
-  // Create the argument array.
-  Value *argArray = createKernelArgArray(op);
-
+mlir::FailureOr<std::pair<llvm::Value *, llvm::Value *>>
+llvm::LaunchKernel::getKernelInfo(mlir::gpu::LaunchFuncOp op,
+                                  mlir::gpu::ObjectAttr object) {
   // Default JIT optimization level.
   llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0);
   // Check if there's an optimization level embedded in the object.
@@ -385,7 +368,6 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
       return op.emitError("the optimization level must be an integer");
     optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue());
   }
-
   // Load the kernel module.
   StringRef moduleName = op.getKernelModuleName().getValue();
   std::string binaryIdentifier = getBinaryIdentifier(moduleName);
@@ -417,6 +399,56 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
       getModuleFunctionFn(),
       {moduleObject,
        getOrCreateFunctionName(moduleName, op.getKernelName().getValue())});
+  return std::pair<Value *, Value *>(moduleFunction, moduleObject);
+}
+
+// Emits LLVM IR to launch a kernel function:
+// %4 = call %streamCreate()
+// %5 = <see generateParamsArray>
+// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
+// call %streamSynchronize(%4)
+// call %streamDestroy(%4)
+// call %moduleUnload(%1)
+mlir::LogicalResult
+llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
+                                       mlir::gpu::ObjectAttr object,
+                                       Value *kernelPtr) {
+  auto llvmValue = [&](mlir::Value value) -> Value * {
+    Value *v = moduleTranslation.lookupValue(value);
+    assert(v && "Value has not been translated.");
+    return v;
+  };
+
+  // Get grid dimensions.
+  mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues();
+  Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y),
+        *gz = llvmValue(grid.z);
+
+  // Get block dimensions.
+  mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues();
+  Value *bx = llvmValue(block.x), *by = llvmValue(block.y),
+        *bz = llvmValue(block.z);
+
+  // Get dynamic shared memory size.
+  Value *dynamicMemorySize = nullptr;
+  if (mlir::Value dynSz = op.getDynamicSharedMemorySize())
+    dynamicMemorySize = llvmValue(dynSz);
+  else
+    dynamicMemorySize = ConstantInt::get(i32Ty, 0);
+
+  // Create the argument array.
+  Value *argArray = createKernelArgArray(op);
+
+  Value *moduleObject = nullptr, *moduleFunction = nullptr;
+
+  if (!kernelPtr) {
+    mlir::FailureOr<std::pair<Value *, Value *>> kernelInfo =
+        getKernelInfo(op, object);
+    if (failed(kernelInfo))
+      return failure();
+    moduleFunction = kernelInfo->first;
+    moduleObject = kernelInfo->second;
+  }
 
   // Get the stream to use for execution. If there's no async object then create
   // a stream to make a synchronous kernel launch.
@@ -436,19 +468,27 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
   Value *nullPtr = ConstantPointerNull::get(ptrTy);
 
   // Launch kernel with clusters if cluster size is specified.
-  if (op.hasClusterSize()) {
-    mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
-    Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
-          *cz = llvmValue(cluster.z);
-    builder.CreateCall(
-        getClusterKernelLaunchFn(),
-        ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
-                           dynamicMemorySize, stream, argArray, nullPtr}));
+  if (moduleFunction) {
+    if (op.hasClusterSize()) {
+      mlir::gpu::KernelDim3 cluster = op.getClusterSizeOperandValues();
+      Value *cx = llvmValue(cluster.x), *cy = llvmValue(cluster.y),
+            *cz = llvmValue(cluster.z);
+      builder.CreateCall(
+          getClusterKernelLaunchFn(),
+          ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
+                             dynamicMemorySize, stream, argArray, nullPtr}));
+    } else {
+      builder.CreateCall(getKernelLaunchFn(),
+                         ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
+                                            bz, dynamicMemorySize, stream,
+                                            argArray, nullPtr, paramsCount}));
+    }
   } else {
-    builder.CreateCall(getKernelLaunchFn(),
-                       ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
-                                          bz, dynamicMemorySize, stream,
-                                          argArray, nullPtr, paramsCount}));
+    assert(kernelPtr && "invalid kernel pointer");
+    builder.CreateCall(
+        getKernelRTLaunchFn(),
+        ArrayRef<Value *>({kernelPtr, gx, gy, gz, bx, by, bz, dynamicMemorySize,
+                           stream, argArray, nullPtr, paramsCount}));
   }
 
   // Sync & destroy the stream, for synchronous launches.
@@ -458,7 +498,285 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
   }
 
   // Unload the kernel module.
-  builder.CreateCall(getModuleUnloadFn(), {moduleObject});
+  if (moduleObject)
+    builder.CreateCall(getModuleUnloadFn(), {moduleObject});
+
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// OffloadEmbeddingAttr
+//===----------------------------------------------------------------------===//
+#include "mlir/Target/LLVM/Offload.h"
+#include "llvm/Frontend/Offloading/OffloadWrapper.h"
+#include "llvm/Frontend/Offloading/Utility.h"
+#include "llvm/Object/OffloadBinary.h"
+
+namespace {
+// Implementation of the `OffloadingLLVMTranslationAttrInterface` model.
+class OffloadEmbeddingAttrImpl
+    : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel<
+          OffloadEmbeddingAttrImpl> {
+public:
+  // Translates a `gpu.binary`, embedding the binary into a host LLVM module as
+  // global binary string.
+  LogicalResult embedBinary(Attribute attribute, Operation *operation,
+                            llvm::IRBuilderBase &builder,
+                            LLVM::ModuleTranslation &moduleTranslation) const;
+
+  // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting
+  // in a kernel launch call.
+  LogicalResult launchKernel(Attribute attribute,
+                             Operation *launchFuncOperation,
+                             Operation *binaryOperation,
+                             llvm::IRBuilderBase &builder,
+                             LLVM::ModuleTranslation &moduleTranslation) const;
+};
+} // namespace
+
+namespace {
+llvm::object::ImageKind getImageKind(gpu::CompilationTarget format) {
+  switch (format) {
+  case gpu::CompilationTarget::Offload:
+    return llvm::object::IMG_Bitcode;
+  case gpu::CompilationTarget::Assembly:
+    return llvm::object::IMG_PTX;
+  case gpu::CompilationTarget::Binary:
+    return llvm::object::IMG_Object;
+  case gpu::CompilationTarget::Fatbin:
+    return llvm::object::IMG_Fatbinary;
+  }
+}
+
+llvm::object::OffloadKind getOffloadKind(gpu::OffloadKind offloadKind) {
+  switch (offloadKind) {
+  case gpu::OffloadKind::OpenMP:
+    return llvm::object::OFK_OpenMP;
+  case gpu::OffloadKind::CUDA:
+    return llvm::object::OFK_Cuda;
+  case gpu::OffloadKind::HIP:
+    return llvm::object::OFK_HIP;
+  }
+}
+
+using OffloadEntryArray = LLVM::OffloadHandler::OffloadEntryArray;
+
+/// Utility class for embedding binaries and launching kernels using the
+/// offloading attribute.
+class OffloadManager : public LLVM::OffloadHandler {
+public:
+  OffloadManager(gpu::BinaryOp binaryOp, llvm::Module &module,
+                 gpu::OffloadKind offloadKind)
+      : LLVM::OffloadHandler(module), binaryOp(binaryOp),
+        offloadKind(offloadKind) {}
+
+  /// Embed a GPU binary into a module.
+  LogicalResult embedBinary();
+
+  /// Generates the kernel launch call.
+  LogicalResult launchKernel(gpu::LaunchFuncOp launchFunc,
+                             llvm::IRBuilderBase &builder,
+                             LLVM::ModuleTranslation &moduleTranslation);
+
+protected:
+  /// Returns the name to be used for the offloading symbols.
+  StringRef getSymbolSuffix();
+
+  /// Emits the offloading entry for `launchFunc`.
+  LogicalResult emitOffloadingEntry(gpu::LaunchFuncOp launchFunc,
+                                    llvm::Constant *registeredSym);
+
+  /// Bundle OpenMP images together.
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>>
+  bundleOpenMP(ArrayRef<Attribute> objects);
+
+  /// Bundle gpu-objects together. TODO: support more than a single object.
+  FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+  bundleGPU(ArrayRef<Attribute> objects);
+
+  /// Bundle objects depending on the `gpu::OffloadKind`.
+  FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+  bundleImages(ArrayRef<Attribute> objects);
+
+  /// Emit registration code and embed the images.
+  LogicalResult wrapImages(llvm::Module &module, ArrayRef<ArrayRef<char>> imgs);
+
+  /// Convert a `ObjectAttr` to a OffloadingImage.
+  llvm::object::OffloadBinary::OffloadingImage
+  getOffloadingImage(gpu::ObjectAttr obj);
 
+  gpu::BinaryOp binaryOp;
+  gpu::OffloadKind offloadKind;
+};
+} // namespace
+
+llvm::object::OffloadBinary::OffloadingImage
+OffloadManager::getOffloadingImage(gpu::ObjectAttr obj) {
+  // Create the binary used by Libomptarget
+  auto targetAttr = cast<TargetInfoAttrInterface>(obj.getTarget());
+  llvm::object::OffloadBinary::OffloadingImage imageBinary{};
+  imageBinary.TheImageKind = getImageKind(obj.getFormat());
+  imageBinary.TheOffloadKind = getOffloadKind(offloadKind);
+  imageBinary.StringData["triple"] = targetAttr.getTargetTriple();
+  imageBinary.StringData["arch"] = targetAttr.getTargetChip();
+  imageBinary.Image =
+      llvm::MemoryBuffer::getMemBufferCopy(obj.getObject().getValue(), "");
+  return imageBinary;
+}
+
+SmallVector<std::unique_ptr<llvm::MemoryBuffer>>
+OffloadManager::bundleOpenMP(ArrayRef<Attribute> objects) {
+  // Bundle all the available objects in the binary.
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> buffers;
+  for (Attribute attr : objects)
+    buffers.emplace_back(
+        llvm::MemoryBuffer::getMemBufferCopy(llvm::object::OffloadBinary::write(
+            getOffloadingImage(cast<gpu::ObjectAttr>(attr)))));
+  return buffers;
+}
+
+FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+OffloadManager::bundleGPU(ArrayRef<Attribute> objects) {
+  if (objects.size() > 1)
+    return binaryOp.emitError("multiple objects are not yet supported");
+  SmallVector<std::unique_ptr<llvm::MemoryBuffer>> buffers;
+  assert(objects.size() == 1 && "there should be a single object");
+  auto object = cast<gpu::ObjectAttr>(objects[0]);
+  if (gpu::CompilationTarget frmt = object.getFormat();
+      frmt != gpu::CompilationTarget::Binary &&
+      frmt != gpu::CompilationTarget::Fatbin)
+    return binaryOp.emitError(
+        "the only supported objects are binaries and fat-binaries.");
+  buffers.emplace_back(
+      llvm::MemoryBuffer::getMemBuffer(object.getObject().getValue()));
+  return buffers;
+}
+
+FailureOr<SmallVector<std::unique_ptr<llvm::MemoryBuffer>>>
+OffloadManager::bundleImages(ArrayRef<Attribute> objects) {
+  switch (offloadKind) {
+  case gpu::OffloadKind::OpenMP:
+    return bundleOpenMP(objects);
+  case gpu::OffloadKind::CUDA:
+  case gpu::OffloadKind::HIP:
+    return bundleGPU(objects);
+  }
+}
+
+StringRef OffloadManager::getSymbolSuffix() { return binaryOp.getName(); }
+
+LogicalResult
+OffloadManager::emitOffloadingEntry(gpu::LaunchFuncOp launchFunc,
+                                    llvm::Constant *registeredSym) {
+  // Create the entry initializer.
+  std::pair<llvm::Constant *, llvm::GlobalVariable *> entry =
+      llvm::offloading::getOffloadingEntryInitializer(
+          module, registeredSym, launchFunc.getKernelName().getValue(), 0, 0,
+          0);
+  if (failed(insertOffloadEntry(getSymbolSuffix(), entry.first)))
+    return binaryOp.emitError("entry array symbols not found");
+  return success();
+}
+
+LogicalResult OffloadManager::wrapImages(llvm::Module &module,
+                                         ArrayRef<ArrayRef<char>> imgs) {
+  // This suffix is appended to all the symbols emitted by the `wrap*` methods.
+  std::string suffix = "." + getSymbolSuffix().str();
+  // Emit an empty entry array.
+  OffloadEntryArray entryArray = emitEmptyEntryArray(getSymbolSuffix());
+  switch (offloadKind) {
+  case gpu::OffloadKind::OpenMP:
+    if (auto error = llvm::offloading::wrapOpenMPBinaries(module, imgs,
+                                                          entryArray, suffix))
+      return binaryOp.emitError("failed wrapping the OpenMP binaries");
+    return success();
+  case gpu::OffloadKind::CUDA:
+    if (auto error = llvm::offloading::wrapCudaBinary(
+            module, imgs.front(), entryArray, suffix, false))
+      return binaryOp.emitError("failed wrapping the CUDA binaries");
+    return success();
+  case gpu::OffloadKind::HIP:
+    if (auto error = llvm::offloading::wrapHIPBinary(module, imgs.front(),
+                                                     entryArray, suffix, false))
+      return binaryOp.emitError("failed wrapping the HIP binaries");
+    return success();
+  }
+}
+
+LogicalResult OffloadManager::embedBinary() {
+  // Call all the methods in order, bundleImages -> wrapImages.
+  auto bundledImgs = bundleImages(binaryOp.getObjectsAttr().getValue());
+  if (failed(bundledImgs))
+    return failure();
+  SmallVector<ArrayRef<char>> imgs;
+  for (auto &img : bundledImgs.value())
+    imgs.push_back(ArrayRef<char>(img->getBufferStart(), img->getBufferSize()));
+  if (failed(wrapImages(module, imgs)))
+    return failure();
+  return success();
+}
+
+LogicalResult
+OffloadManager::launchKernel(gpu::LaunchFuncOp launchFunc,
+                             llvm::IRBuilderBase &builder,
+                             LLVM::ModuleTranslation &moduleTranslation) {
+  // OpenMMP kernels launches are handled by the `omp.target` op.
+  if (offloadKind == gpu::OffloadKind::OpenMP)
+    return binaryOp.emitError(
+        "it's invalid to call OpenMP kernels using gpu.launch_func");
+  llvm::Module *hostModule = moduleTranslation.getLLVMModule();
+  // Create or get the symbol to be registered.
+  std::string symbolId =
+      (binaryOp.getName() + "_K" + launchFunc.getKernelName().getValue()).str();
+  llvm::Constant *registeredSym = nullptr;
+  if (!(registeredSym = hostModule->getGlobalVariable(symbolId))) {
+    // Create the symbol used to register the kernel with the runtime.
+    registeredSym = new llvm::GlobalVariable(
+        *hostModule, builder.getInt8Ty(), /*isConstant=*/true,
+        llvm::GlobalValue::InternalLinkage, builder.getInt8(0), symbolId);
+    // Emit the offload entry.
+    if (failed(emitOffloadingEntry(launchFunc, registeredSym)))
+      return failure();
+  }
+  return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder,
+                            moduleTranslation)
+      .createKernelLaunch(launchFunc, nullptr, registeredSym);
+}
+
+LogicalResult OffloadEmbeddingAttrImpl::embedBinary(
+    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation) const {
+  if (failed(OffloadManager(
+                 mlir::cast<gpu::BinaryOp>(operation),
+                 *moduleTranslation.getLLVMModule(),
+                 mlir::cast<gpu::OffloadEmbeddingAttr>(attribute).getKind())
+                 .embedBinary()))
+    return failure();
+  return success();
+}
+
+LogicalResult OffloadEmbeddingAttrImpl::launchKernel(
+    Attribute attribute, Operation *launchFuncOperation,
+    Operation *binaryOperation, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation) const {
+  if (failed(
+          OffloadManager(
+              mlir::cast<gpu::BinaryOp>(binaryOperation),
+              *moduleTranslation.getLLVMModule(),
+              mlir::cast<gpu::OffloadEmbeddingAttr>(attribute).getKind())
+              .launchKernel(mlir::cast<gpu::LaunchFuncOp>(launchFuncOperation),
+                            builder, moduleTranslation)))
+    return failure();
   return success();
 }
+
+//===----------------------------------------------------------------------===//
+// Interface registration
+//===----------------------------------------------------------------------===//
+void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
+    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
+    OffloadEmbeddingAttr::attachInterface<OffloadEmbeddingAttrImpl>(*ctx);
+  });
+}
diff --git a/mlir/test/Target/LLVMIR/gpu.mlir b/mlir/test/Target/LLVMIR/gpu.mlir
index 88672bd231df8f..74dfa53558d71f 100644
--- a/mlir/test/Target/LLVMIR/gpu.mlir
+++ b/mlir/test/Target/LLVMIR/gpu.mlir
@@ -101,3 +101,86 @@ module attributes {gpu.container_module} {
     llvm.return
   }
 }
+
+// -----
+
+// Test the `offload_embedding<cuda>` attribute.
+module attributes {gpu.container_module} {
+  // CHECK: @__begin_offload_kernel_module = internal constant [1 x %{{.*}}] [%{{.*}} { ptr @[[KERNEL_SYMBOL:.*]], ptr @[[ENTRY_NAME:.*]], i64 0, i32 0, i32 0 }]
+  // CHECK: @__end_offload_kernel_module = internal constant ptr getelementptr inbounds (%{{.*}}, ptr @__begin_offload_kernel_module, i64 1)
+  // CHECK: @[[FATBIN:.*]] = internal constant [4 x i8] c"BLOB", section ".nv_fatbin"
+  // CHECK: @[[FATBIN_HANDLE:.*]] = internal constant %{{.*}} { i32 1180844977, i32 1, ptr @[[FATBIN]]
+  // CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_CTOR:.*]], ptr null }]
+  // CHECK: @[[KERNEL_SYMBOL]] = internal constant i8 0
+  // CHECK-NEXT: @[[ENTRY_NAME]] = internal unnamed_addr constant [7 x i8] c"kernel\00"
+  gpu.binary @kernel_module <#gpu.offload_embedding<cuda>> [#gpu.object<#nvvm.target, bin = "BLOB">]
+  llvm.func @foo() {
+    // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8
+    // CHECK-NEXT: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8
+    // CHECK-NEXT: [[ARG0:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0
+    // CHECK-NEXT: store i32 32, ptr [[ARG0]], align 4
+    // CHECK-NEXT: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0
+    // CHECK-NEXT: store ptr [[ARG0]], ptr %{{.*}}, align 8
+    // CHECK-NEXT: [[ARG1:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1
+    // CHECK-NEXT: store i32 32, ptr [[ARG1]], align 4
+    // CHECK-NEXT: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
+    // CHECK-NEXT: store ptr [[ARG1]], ptr %{{.*}}, align 8
+    // CHECK-NEXT: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
+    // CHECK-NEXT: call void @mgpuLaunchKernelRT(ptr @[[KERNEL_SYMBOL]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
+    // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr [[STREAM]])
+    // CHECK-NEXT: call void @mgpuStreamDestroy(ptr [[STREAM]])
+    %0 = llvm.mlir.constant(8 : index) : i64
+    %1 = llvm.mlir.constant(32 : i32) : i32
+    %2 = llvm.mlir.constant(256 : i32) : i32
+    gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
+    llvm.return
+  }
+  // CHECK: define internal void @[[REGISTRATION_CTOR]]
+  // CHECK: %{{.*}} = call ptr @__cudaRegisterFatBinary(ptr @[[FATBIN_HANDLE]])
+}
+
+// -----
+
+// Test the `offload_embedding<hip>` attribute.
+module attributes {gpu.container_module} {
+  // CHECK: @__begin_offload_kernel_module = internal constant [2 x %{{.*}}] [
+  // CHECK: %{{.*}} { ptr @[[KERNEL_1_SYMBOL:.*]], ptr @[[ENTRY_NAME_1:.*]], i64 0, i32 0, i32 0 },
+  // CHECK: %{{.*}} { ptr @[[KERNEL_2_SYMBOL:.*]], ptr @[[ENTRY_NAME_2:.*]], i64 0, i32 0, i32 0 }]
+  // CHECK: @__end_offload_kernel_module = internal constant ptr getelementptr inbounds (%{{.*}}, ptr @__begin_offload_kernel_module, i64 2)
+  // CHECK: @[[FATBIN:.*]] = internal constant [4 x i8] c"BLOB", section ".hip_fatbin"
+  // CHECK: @[[FATBIN_HANDLE:.*]] = internal constant %{{.*}} { i32 1212764230, i32 1, ptr @[[FATBIN]]
+  // CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_CTOR:.*]], ptr null }]
+  // CHECK: @[[KERNEL_1_SYMBOL]] = internal constant i8 0
+  // CHECK-NEXT: @[[ENTRY_NAME_1]] = internal unnamed_addr constant [9 x i8] c"kernel_1\00"
+  // CHECK: @[[KERNEL_2_SYMBOL]] = internal constant i8 0
+  // CHECK-NEXT: @[[ENTRY_NAME_2]] = internal unnamed_addr constant [9 x i8] c"kernel_2\00"
+  gpu.binary @kernel_module <#gpu.offload_embedding<hip>> [#gpu.object<#rocdl.target, bin = "BLOB">]
+  llvm.func @foo() {
+    %0 = llvm.mlir.constant(8 : index) : i64
+    %1 = llvm.mlir.constant(32 : i32) : i32
+    %2 = llvm.mlir.constant(256 : i32) : i32
+    gpu.launch_func @kernel_module::@kernel_1 blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
+    gpu.launch_func @kernel_module::@kernel_2 blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32)
+    llvm.return
+  }
+  // CHECK: define internal void @[[REGISTRATION_CTOR]]
+  // CHECK: %{{.*}} = call ptr @__hipRegisterFatBinary(ptr @[[FATBIN_HANDLE]])
+}
+
+// -----
+
+// Test the `offload_embedding<omp>` attribute.
+module attributes {gpu.container_module} {
+  // CHECK: @__begin_offload_kernel_module = internal constant [0 x %{{.*}}] zeroinitializer
+  // CHECK: @__end_offload_kernel_module = internal constant ptr @__begin_offload_kernel_module
+  // CHECK: @[[BINARY:.*]] = internal unnamed_addr constant [{{.*}} x i8] c"{{.*}}", section ".llvm.offloading", align 8
+  // CHECK: @[[BINARIES:.*]] = internal unnamed_addr constant [1 x %{{.*}}] [%{{.*}} { ptr getelementptr inbounds ([{{.*}} x i8], ptr @[[BINARY]], i64 0, i64 {{.*}}), ptr getelementptr inbounds ([{{.*}} x i8], ptr @[[BINARY]], i64 0, i64 {{.*}}), ptr @__begin_offload_kernel_module, ptr @__end_offload_kernel_module }]
+  // CHECK: @[[DESCRIPTOR:.*]] = internal constant %{{.*}} { i32 1, ptr @[[BINARIES]], ptr @__begin_offload_kernel_module, ptr @__end_offload_kernel_module }
+  // CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_CTOR:.*]], ptr null }]
+  // CHECK: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @[[REGISTRATION_DTOR:.*]], ptr null }]
+  gpu.binary @kernel_module <#gpu.offload_embedding<omp>> [#gpu.object<#rocdl.target, bin = "BLOB">]
+  // CHECK: define internal void @[[REGISTRATION_CTOR]]
+  // CHECK: call {{.*}} @__tgt_register_lib(ptr @[[DESCRIPTOR]])
+  // CHECK: define internal void @[[REGISTRATION_DTOR]]
+  // CHECK: call {{.*}} @__tgt_unregister_lib(ptr @[[DESCRIPTOR]])
+}

>From 959a1c87d913d5f53470d7c876dadc06b6b1e4b9 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Tue, 16 Jan 2024 12:06:28 +0000
Subject: [PATCH 8/9] Add translation libs

---
 mlir/include/mlir/Target/LLVMIR/Dialect/All.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
index 7e76b29bb5977e..ab159fc88bc869 100644
--- a/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
+++ b/mlir/include/mlir/Target/LLVMIR/Dialect/All.h
@@ -65,12 +65,15 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
   registerBuiltinDialectTranslation(registry);
   registerGPUDialectTranslation(registry);
   registerLLVMDialectTranslation(registry);
+  registerOpenMPDialectTranslation(registry);
   registerNVVMDialectTranslation(registry);
   registerROCDLDialectTranslation(registry);
   registerSPIRVDialectTranslation(registry);
 
   // Extension required for translating GPU offloading Ops.
   gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
+  // Extensions required for translating the OpenMP dialect.
+  omp::registerAllExternalModels(registry);
 }
 
 /// Registers all dialects that can be translated from LLVM IR and the

>From d1d436087693acecb1479f6717628419f680338b Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev at gmail.com>
Date: Tue, 16 Jan 2024 15:59:26 +0000
Subject: [PATCH 9/9] [mlir][OpenMP] Add outlining pass for `TargetOp`

This patch adds a pass to outline OpenMP target operations into a GPU module,
allowing them to be compiled using the GPU dialect compilation infrastructure.

The pass works by traversing each function, outlining the ops to a GPU module,
and then cloning all the symbols referenced inside the target regions marked
with a declare target attribute. The outlining mechanism is similar to the one
found in `gpu-kernel-outlining`.

Note: Ignore the base commits, they are being reviewed in other PRs.
---
 .../mlir/Dialect/OpenMP/CMakeLists.txt        |   2 +
 .../Dialect/OpenMP/Transforms/CMakeLists.txt  |   7 +
 .../mlir/Dialect/OpenMP/Transforms/Passes.h   |  32 +++
 .../mlir/Dialect/OpenMP/Transforms/Passes.td  |  26 ++
 mlir/include/mlir/InitAllPasses.h             |   2 +
 mlir/lib/Dialect/OpenMP/CMakeLists.txt        |  20 ++
 .../OpenMP/Transforms/TargetOutlining.cpp     | 232 ++++++++++++++++++
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   6 +-
 .../test/Dialect/OpenMP/target-outlining.mlir |  51 ++++
 9 files changed, 376 insertions(+), 2 deletions(-)
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
 create mode 100644 mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
 create mode 100644 mlir/lib/Dialect/OpenMP/Transforms/TargetOutlining.cpp
 create mode 100644 mlir/test/Dialect/OpenMP/target-outlining.mlir

diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
index 419e24a7335361..57f7cf482ffbca 100644
--- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt
@@ -23,3 +23,5 @@ mlir_tablegen(OpenMPTypeInterfaces.h.inc -gen-type-interface-decls)
 mlir_tablegen(OpenMPTypeInterfaces.cpp.inc -gen-type-interface-defs)
 add_public_tablegen_target(MLIROpenMPTypeInterfacesIncGen)
 add_dependencies(mlir-generic-headers MLIROpenMPTypeInterfacesIncGen)
+
+add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
new file mode 100644
index 00000000000000..2115ec9492b27e
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name OMP)
+mlir_tablegen(Passes.capi.h.inc -gen-pass-capi-header --prefix OMP)
+mlir_tablegen(Passes.capi.cpp.inc -gen-pass-capi-impl --prefix OMP)
+add_public_tablegen_target(MLIROMPPassIncGen)
+
+add_mlir_doc(Passes OMPPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
new file mode 100644
index 00000000000000..9c04f66031fef7
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.h
@@ -0,0 +1,32 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes that expose pass constructors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
+
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Pass/Pass.h"
+#include <optional>
+
+namespace mlir {
+namespace omp {
+#define GEN_PASS_DECL
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+} // namespace omp
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENMP_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
new file mode 100644
index 00000000000000..8762b2a804b6a7
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenMP/Transforms/Passes.td
@@ -0,0 +1,26 @@
+//===-- Passes.td - OMP pass definition file ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENMP_PASSES
+#define MLIR_DIALECT_OPENMP_PASSES
+
+include "mlir/Pass/PassBase.td"
+
+def OMPTargetGPUOutlining : Pass<"omp-target-outline-to-gpu", "ModuleOp"> {
+  let summary = "Outline omp.target regions into GPU modules.";
+  let description = [{
+
+  }];
+  let dependentDialects = ["mlir::gpu::GPUDialect", "mlir::func::FuncDialect"];
+  let options = [
+    Option<"moduleName", "name", "std::string", [{"omp_offload"}],
+           "Name of the GPU module.">
+  ];
+}
+
+#endif // MLIR_DIALECT_OPENMP_PASSES
diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h
index 28dc3cc23daf2b..60b3d6beab83ca 100644
--- a/mlir/include/mlir/InitAllPasses.h
+++ b/mlir/include/mlir/InitAllPasses.h
@@ -34,6 +34,7 @@
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Mesh/Transforms/Passes.h"
 #include "mlir/Dialect/NVGPU/Transforms/Passes.h"
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
 #include "mlir/Dialect/SCF/Transforms/Passes.h"
 #include "mlir/Dialect/SPIRV/Transforms/Passes.h"
 #include "mlir/Dialect/Shape/Transforms/Passes.h"
@@ -79,6 +80,7 @@ inline void registerAllPasses() {
   memref::registerMemRefPasses();
   mesh::registerMeshPasses();
   ml_program::registerMLProgramPasses();
+  omp::registerOMPPasses();
   registerSCFPasses();
   registerShapePasses();
   spirv::registerSPIRVPasses();
diff --git a/mlir/lib/Dialect/OpenMP/CMakeLists.txt b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
index a1af7e40dcc6b6..5c9f38d0b6c0df 100644
--- a/mlir/lib/Dialect/OpenMP/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenMP/CMakeLists.txt
@@ -73,3 +73,23 @@ add_mlir_dialect_library(MLIROpenMPLLVMExternalModels
   MLIROpenMPDialect
   MLIRLLVMDialect
 )
+
+add_mlir_dialect_library(MLIROpenMPTransforms
+  Transforms/TargetOutlining.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/OpenMP
+
+  LINK_COMPONENTS
+  Support
+
+  DEPENDS
+  MLIROMPPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIROpenMPDialect
+  MLIRGPUDialect
+  MLIRFuncDialect
+  MLIRPass
+  MLIRTransformUtils
+  )
diff --git a/mlir/lib/Dialect/OpenMP/Transforms/TargetOutlining.cpp b/mlir/lib/Dialect/OpenMP/Transforms/TargetOutlining.cpp
new file mode 100644
index 00000000000000..f8b5e1687bf5d3
--- /dev/null
+++ b/mlir/lib/Dialect/OpenMP/Transforms/TargetOutlining.cpp
@@ -0,0 +1,232 @@
+//===- TargetOutlining.cpp - Implementation of Target kernel outlining ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GPU dialect kernel outlining pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h"
+
+#include "mlir/AsmParser/AsmParser.h"
+#include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Utils.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+#include "llvm/Support/FileSystem.h"
+
+namespace mlir {
+namespace omp {
+#define GEN_PASS_DEF_OMPTARGETGPUOUTLINING
+#include "mlir/Dialect/OpenMP/Transforms/Passes.h.inc"
+} // namespace omp
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::omp;
+
+namespace {
+/// Pass that moves the kernel of each LaunchOp into its separate nested module.
+///
+/// This pass moves the kernel code of each LaunchOp into a function created
+/// inside a nested module. It also creates an external function of the same
+/// name in the parent module.
+class OMPTargetGPUOutlining
+    : public omp::impl::OMPTargetGPUOutliningBase<OMPTargetGPUOutlining> {
+public:
+  using Base::Base;
+  void runOnOperation() override;
+
+private:
+  struct TargetOpInfo {
+    mutable TargetOp op;
+    StringRef parentName;
+    uint32_t functionId = 0;
+    uint32_t opId = 0;
+    uint32_t uniqueId = 0;
+  };
+  // Create the `TargetRegionEntryInfoAttr` for the TargetOp.
+  TargetRegionEntryInfoAttr getTargetEntryInfo(OpBuilder builder,
+                                               const TargetOpInfo &opInfo,
+                                               llvm::StringRef moduleName);
+  // Outline the TargetOp to the GPU module.
+  LogicalResult outlineTargetOp(gpu::GPUModuleOp module,
+                                SymbolTable &devSymbolTable,
+                                SymbolTable &hostSymbolTable,
+                                const TargetOpInfo &opInfo);
+  // Add the referenced declare target symbols to the module.
+  LogicalResult cloneDeclareTarget(OpBuilder builder, TargetOp op,
+                                   SymbolTable &devSymbolTable,
+                                   SymbolTable &hostSymbolTable,
+                                   StringRef moduleName);
+};
+} // namespace
+
+void OMPTargetGPUOutlining::runOnOperation() {
+  SymbolTable hostSymbolTable(getOperation());
+  // Collect all `omp.target` ops
+  SmallVector<TargetOpInfo> targetOps;
+  uint32_t uniqueId = 0;
+  uint32_t functionId = 0;
+  for (auto func : getOperation().getOps<FunctionOpInterface>()) {
+    uint32_t opId = 0;
+    func.walk([&](omp::TargetOp op) {
+      targetOps.push_back({op, func.getName(), functionId, opId++, uniqueId++});
+      return WalkResult::advance();
+    });
+    functionId++;
+  }
+  // Return early if there's no work to do
+  if (targetOps.empty())
+    return;
+  // Create the GPU module
+  OpBuilder builder(getOperation().getContext());
+  auto devModule = builder.create<gpu::GPUModuleOp>(
+      getOperation().getLoc(), moduleName, nullptr,
+      builder.getAttr<gpu::OffloadEmbeddingAttr>(gpu::OffloadKind::OpenMP));
+  hostSymbolTable.insert(devModule, getOperation().getBody()->begin());
+  if (auto moduleIface =
+          dyn_cast<OffloadModuleInterface>(devModule.getOperation())) {
+    moduleIface.setIsGPU(true);
+    moduleIface.setIsTargetDevice(true);
+  }
+  getOperation()->setAttr(gpu::GPUDialect::getContainerModuleAttrName(),
+                          UnitAttr::get(&getContext()));
+  SymbolTable devSymbolTable(devModule);
+  // Outline all the target Ops
+  for (TargetOpInfo &opInfo : targetOps)
+    if (failed(outlineTargetOp(devModule, devSymbolTable, hostSymbolTable,
+                               opInfo)))
+      return signalPassFailure();
+}
+
+TargetRegionEntryInfoAttr OMPTargetGPUOutlining::getTargetEntryInfo(
+    OpBuilder builder, const TargetOpInfo &opInfo, llvm::StringRef moduleName) {
+  auto fileLoc = opInfo.op.getLoc()->findInstanceOf<FileLineColLoc>();
+  // Try to create the entry info from `FileLineColLoc`
+  if (fileLoc) {
+    StringRef fileName = fileLoc.getFilename().getValue();
+
+    llvm::sys::fs::UniqueID id;
+    if (auto ec = llvm::sys::fs::getUniqueID(fileName, id)) {
+      opInfo.op.emitError("unable to get unique ID for file");
+      return nullptr;
+    }
+    uint64_t line = fileLoc.getLine();
+    return builder.getAttr<TargetRegionEntryInfoAttr>(
+        id.getDevice(), id.getFile(), line,
+        builder.getAttr<FlatSymbolRefAttr>(moduleName));
+  }
+  return builder.getAttr<TargetRegionEntryInfoAttr>(
+      opInfo.uniqueId, opInfo.functionId, opInfo.opId,
+      builder.getAttr<FlatSymbolRefAttr>(moduleName));
+}
+
+LogicalResult OMPTargetGPUOutlining::outlineTargetOp(
+    gpu::GPUModuleOp module, SymbolTable &devSymbolTable,
+    SymbolTable &hostSymbolTable, const TargetOpInfo &opInfo) {
+  TargetOp targetOp = opInfo.op;
+  OpBuilder builder(targetOp.getContext());
+  Location loc = targetOp.getLoc();
+  // Set the entry info.
+  targetOp.setTargetRegionEntryInfoAttr(
+      getTargetEntryInfo(builder, opInfo, module.getName()));
+  // Get the values that have to be mapped.
+  SmallVector<Value> outlinedValues;
+  SmallVector<Type> outlinedFnArgTypes;
+  if (auto ifExpr = targetOp.getIfExpr()) {
+    outlinedValues.push_back(ifExpr);
+    outlinedFnArgTypes.push_back(ifExpr.getType());
+  }
+  if (auto dev = targetOp.getDevice()) {
+    outlinedValues.push_back(dev);
+    outlinedFnArgTypes.push_back(dev.getType());
+  }
+  if (auto thrLimit = targetOp.getThreadLimit()) {
+    outlinedValues.push_back(thrLimit);
+    outlinedFnArgTypes.push_back(thrLimit.getType());
+  }
+  for (const auto &operand : targetOp.getMapOperands()) {
+    auto mapInfo = dyn_cast_or_null<MapInfoOp>(operand.getDefiningOp());
+    if (!mapInfo)
+      return targetOp.emitError("missing map info");
+    for (Value operand : mapInfo->getOperands()) {
+      outlinedValues.push_back(operand);
+      outlinedFnArgTypes.push_back(operand.getType());
+    }
+  }
+  // Create the outlined function.
+  FunctionType type =
+      FunctionType::get(targetOp.getContext(), outlinedFnArgTypes, {});
+  auto outlinedFunc =
+      builder.create<func::FuncOp>(loc, opInfo.parentName, type);
+  devSymbolTable.insert(outlinedFunc);
+  // Map the operands of the outlined function.
+  Block &entryBlock = outlinedFunc.getBody().emplaceBlock();
+  builder.setInsertionPointToEnd(&entryBlock);
+  IRMapping map;
+  for (Value arg : outlinedValues)
+    map.map(arg, entryBlock.addArgument(arg.getType(), arg.getLoc()));
+  for (const auto &operand : targetOp.getMapOperands()) {
+    auto mapInfo = dyn_cast_or_null<MapInfoOp>(operand.getDefiningOp());
+    auto outlinedInfo =
+        dyn_cast<MapInfoOp>(builder.clone(*(mapInfo.getOperation()), map));
+    map.map(operand, outlinedInfo);
+  }
+  // Clone the Op.
+  auto devTargetOp =
+      dyn_cast<TargetOp>(builder.clone(*(targetOp.getOperation()), map));
+  // Add an empty return.
+  builder.create<func::ReturnOp>(loc);
+  // Set the early outlining information.
+  auto outliningIface =
+      dyn_cast<EarlyOutliningInterface>(outlinedFunc.getOperation());
+  assert(outliningIface && "missing outlining interface");
+  outliningIface.setParentName(opInfo.parentName.str());
+  if (failed(cloneDeclareTarget(builder, devTargetOp, devSymbolTable,
+                                hostSymbolTable, module.getName())))
+    return failure();
+  return success();
+}
+
+LogicalResult OMPTargetGPUOutlining::cloneDeclareTarget(
+    OpBuilder builder, TargetOp op, SymbolTable &devSymbolTable,
+    SymbolTable &hostSymbolTable, StringRef moduleName) {
+  SmallVector<Operation *, 8> symbolDefWorklist = {op};
+  // Go through every symbol reference inside the TargetOp.
+  while (!symbolDefWorklist.empty()) {
+    if (std::optional<SymbolTable::UseRange> symbolUses =
+            SymbolTable::getSymbolUses(symbolDefWorklist.pop_back_val())) {
+      for (SymbolTable::SymbolUse symbolUse : *symbolUses) {
+        StringRef symbolName =
+            cast<FlatSymbolRefAttr>(symbolUse.getSymbolRef()).getValue();
+        // Check if the symbol is already in the device module.
+        if (symbolName == moduleName || devSymbolTable.lookup(symbolName))
+          continue;
+        // Find the symbol in the host module and determine whether it's valid.
+        Operation *symbolDef = hostSymbolTable.lookup(symbolName);
+        if (auto iface = dyn_cast<DeclareTargetInterface>(symbolDef);
+            !iface || !iface.isDeclareTarget()) {
+          return symbolDef->emitError("symbol must be a declare target");
+        }
+        // Clone the symbol.
+        Operation *symbolDefClone = symbolDef->clone();
+        symbolDefWorklist.push_back(symbolDefClone);
+        devSymbolTable.insert(symbolDefClone);
+      }
+    }
+  }
+  return success();
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 5492d828a99a3f..adabf27bb596a2 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2461,10 +2461,13 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   // Determine whether the entry is going to be handled by
   // `OffloadEntriesInfoManager` or by this method. If `entryArraySection` is
   // null then it's handled by `OffloadEntriesInfoManager`
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   omp::TargetRegionEntryInfoAttr regionInfoAttr =
       targetOp.getTargetRegionEntryInfoAttr();
   FlatSymbolRefAttr entryArraySection =
-      regionInfoAttr ? regionInfoAttr.getSection() : FlatSymbolRefAttr();
+      (regionInfoAttr && !ompBuilder->Config.isTargetDevice())
+          ? regionInfoAttr.getSection()
+          : FlatSymbolRefAttr();
 
   // Create the target region
   builder.restoreIP(moduleTranslation.getOpenMPBuilder()->createTarget(
@@ -2479,7 +2482,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
 
   // Return early if the target op it's being emitted for a device or if the
   // entry is handled by `OffloadEntriesInfoManager`
-  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   if (ompBuilder->Config.isTargetDevice() || !entryArraySection)
     return bodyGenStatus;
 
diff --git a/mlir/test/Dialect/OpenMP/target-outlining.mlir b/mlir/test/Dialect/OpenMP/target-outlining.mlir
new file mode 100644
index 00000000000000..c3082236851cc8
--- /dev/null
+++ b/mlir/test/Dialect/OpenMP/target-outlining.mlir
@@ -0,0 +1,51 @@
+// RUN: mlir-opt %s --omp-target-outline-to-gpu | FileCheck %s
+
+module attributes {omp.is_target_device = false, omp.is_gpu = false} {
+  func.func @targetFn() -> () attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
+    return
+  }
+  llvm.func @main() {
+    omp.target {
+      func.call @targetFn() : () -> ()
+      omp.terminator
+    }
+    omp.target {
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK-LABEL: module attributes {gpu.container_module, omp.is_gpu = false, omp.is_target_device = false} {
+// CHECK-NEXT: gpu.module @[[DEV_MODULE:.*]] <#gpu.offload_embedding<omp>> attributes {omp.is_gpu = true, omp.is_target_device = true} {
+// CHECK-NEXT: func.func @{{.*}}() attributes {omp.outline_parent_name = "main"} {
+// CHECK-NEXT: omp.target info = #omp.tgt_entry_info<deviceID = [[DEVID_1:.*]], fileID = [[FILEID_1:.*]], line = [[LINE_1:.*]], section = @[[DEV_MODULE]]> {
+// CHECK-NEXT: func.call @targetFn() : () -> ()
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NEXT: func.func @targetFn() attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NEXT: func.func @{{.*}}() attributes {omp.outline_parent_name = "main"} {
+// CHECK-NEXT: omp.target info = #omp.tgt_entry_info<deviceID = [[DEVID_2:.*]], fileID = [[FILEID_2:.*]], line = [[LINE_2:.*]], section = @[[DEV_MODULE]]> {
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NEXT: }
+// CHECK-NEXT: func.func @targetFn() attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
+// CHECK-NEXT: return
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm.func @main() {
+// CHECK-NEXT: omp.target info = #omp.tgt_entry_info<deviceID = [[DEVID_1]], fileID = [[FILEID_1]], line = [[LINE_1]], section = @[[DEV_MODULE]]> {
+// CHECK-NEXT: func.call @targetFn() : () -> ()
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: omp.target info = #omp.tgt_entry_info<deviceID = [[DEVID_2]], fileID = [[FILEID_2]], line = [[LINE_2]], section = @[[DEV_MODULE]]> {
+// CHECK-NEXT: omp.terminator
+// CHECK-NEXT: }
+// CHECK-NEXT: llvm.return
+// CHECK-NEXT: }
+// CHECK-NEXT: }