[flang-commits] [flang] 7691666 - [MLIR][OpenMP] Initial Lowering of Declare Target for Data

Andrew Gozillon via flang-commits flang-commits at lists.llvm.org
Wed Sep 20 11:32:04 PDT 2023


Author: Andrew Gozillon
Date: 2023-09-20T13:31:15-05:00
New Revision: 76916669b96f6411ecc276ffdf77d305093178dc

URL: https://github.com/llvm/llvm-project/commit/76916669b96f6411ecc276ffdf77d305093178dc
DIFF: https://github.com/llvm/llvm-project/commit/76916669b96f6411ecc276ffdf77d305093178dc.diff

LOG: [MLIR][OpenMP] Initial Lowering of Declare Target for Data

This patch adds initial lowering for DeclareTargetAttr on
GlobalOps, utilising registerTargetGlobalVariable
and getAddrOfDeclareTargetVar from the
OMPIRBuilder.

It also adds initial processing of declare target map
operands, populating the combinedInfo that the
OMPIRBuilder requires to generate kernels and
their kernel argument structures.

The combination of these additions allows simple mapping
of declare target globals to Target regions; as such, a simple
runtime test showcasing and testing this has been added.

The patch currently does not factor in filtering
based on device_type clauses (e.g. no emission of
globals for device if host is specified); this will come in
a future iteration. For the moment, it has only been
tested with 1-D arrays and basic Fortran data types;
more complex types (such as user-defined derived
types from Fortran, allocatables or Fortran pointers)
may need further work.

reviewers: kiranchandramohan, skatrak

Differential Revision: https://reviews.llvm.org/D149368

Added: 
    mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
    mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-host.mlir
    openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90

Modified: 
    flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp

Removed: 
    


################################################################################
diff  --git a/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp b/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp
index 20ef66e0ad48cec..4eba921636e8abb 100644
--- a/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp
+++ b/flang/lib/Optimizer/Transforms/OMPEarlyOutlining.cpp
@@ -32,7 +32,7 @@ class OMPEarlyOutliningPass
   // Given a value this function will iterate over an operators results
   // and return the relevant index for the result the value corresponds to.
   // There may be a simpler way to do this however.
-  unsigned getResultIndex(mlir::Value value, mlir::Operation *op) {
+  static unsigned getResultIndex(mlir::Value value, mlir::Operation *op) {
     for (unsigned i = 0; i < op->getNumResults(); ++i) {
       if (op->getResult(i) == value)
         return i;
@@ -40,9 +40,10 @@ class OMPEarlyOutliningPass
     return 0;
   }
 
-  bool isDeclareTargetOp(mlir::Operation *op) {
-    if (fir::AddrOfOp addressOfOp = mlir::dyn_cast<fir::AddrOfOp>(op))
-      if (fir::GlobalOp gOp = mlir::dyn_cast<fir::GlobalOp>(
+  static bool isAddressOfGlobalDeclareTarget(mlir::Value value) {
+    if (fir::AddrOfOp addressOfOp =
+            mlir::dyn_cast_if_present<fir::AddrOfOp>(value.getDefiningOp()))
+      if (fir::GlobalOp gOp = mlir::dyn_cast_if_present<fir::GlobalOp>(
               addressOfOp->getParentOfType<mlir::ModuleOp>().lookupSymbol(
                   addressOfOp.getSymbol())))
         if (auto declareTargetGlobal =
@@ -59,14 +60,14 @@ class OMPEarlyOutliningPass
   // NOTE: Results in duplication of some values that would otherwise be
   // a single SSA value shared between operations, this is tidied up on
   // lowering to some extent.
-  mlir::Operation *
+  static mlir::Operation *
   cloneArgAndChildren(mlir::OpBuilder &builder, mlir::Operation *op,
                       llvm::SetVector<mlir::Value> &inputs,
                       mlir::Block::BlockArgListType &newInputs) {
     mlir::IRMapping valueMap;
-    for (auto opValue : op->getOperands()) {
+    for (mlir::Value opValue : op->getOperands()) {
       if (opValue.getDefiningOp()) {
-        auto resIdx = getResultIndex(opValue, opValue.getDefiningOp());
+        unsigned resIdx = getResultIndex(opValue, opValue.getDefiningOp());
         valueMap.map(opValue,
                      cloneArgAndChildren(builder, opValue.getDefiningOp(),
                                          inputs, newInputs)
@@ -82,11 +83,12 @@ class OMPEarlyOutliningPass
     return builder.clone(*op, valueMap);
   }
 
-  void cloneMapOpVariables(mlir::OpBuilder &builder, mlir::IRMapping &valueMap,
-                           mlir::IRMapping &mapInfoMap,
-                           llvm::SetVector<mlir::Value> &inputs,
-                           mlir::Block::BlockArgListType &newInputs,
-                           mlir::Value varPtr) {
+  static void cloneMapOpVariables(mlir::OpBuilder &builder,
+                                  mlir::IRMapping &valueMap,
+                                  mlir::IRMapping &mapInfoMap,
+                                  llvm::SetVector<mlir::Value> &inputs,
+                                  mlir::Block::BlockArgListType &newInputs,
+                                  mlir::Value varPtr) {
     if (fir::BoxAddrOp boxAddrOp =
             mlir::dyn_cast_if_present<fir::BoxAddrOp>(varPtr.getDefiningOp())) {
       mlir::Value newV =
@@ -97,7 +99,7 @@ class OMPEarlyOutliningPass
       return;
     }
 
-    if (varPtr.getDefiningOp() && isDeclareTargetOp(varPtr.getDefiningOp())) {
+    if (isAddressOfGlobalDeclareTarget(varPtr)) {
       fir::AddrOfOp addrOp =
           mlir::dyn_cast<fir::AddrOfOp>(varPtr.getDefiningOp());
       mlir::Value newV = builder.clone(*addrOp)->getResult(0);
@@ -129,18 +131,17 @@ class OMPEarlyOutliningPass
     // filter out declareTarget and map entries which are specially handled
     // at the moment, so we do not wish these to end up as function arguments
     // which would just be more noise in the IR.
-    for (auto value : inputs)
-      if (value.getDefiningOp())
-        if (mlir::isa<mlir::omp::MapInfoOp>(value.getDefiningOp()) ||
-            isDeclareTargetOp(value.getDefiningOp()))
-          inputs.remove(value);
+    for (mlir::Value value : inputs)
+      if (mlir::isa_and_nonnull<mlir::omp::MapInfoOp>(value.getDefiningOp()) ||
+          isAddressOfGlobalDeclareTarget(value))
+        inputs.remove(value);
 
     // Create new function and initialize
     mlir::FunctionType funcType = builder.getFunctionType(
         mlir::TypeRange(inputs.getArrayRef()), mlir::TypeRange());
     std::string parentName(parentFunc.getName());
     std::string funcName = getOutlinedFnName(parentName, count);
-    auto loc = targetOp.getLoc();
+    mlir::Location loc = targetOp.getLoc();
     mlir::func::FuncOp newFunc =
         mlir::func::FuncOp::create(loc, funcName, funcType);
     mlir::Block *entryBlock = newFunc.addEntryBlock();
@@ -175,11 +176,11 @@ class OMPEarlyOutliningPass
     // however, cloning across the minimum for the moment to avoid
     // optimisations breaking segments of the lowering seems prudent as this
     // was the original intent of the pass.
-    for (auto oper : targetOp.getOperation()->getOperands()) {
+    for (mlir::Value oper : targetOp->getOperands()) {
       if (auto mapEntry =
               mlir::dyn_cast<mlir::omp::MapInfoOp>(oper.getDefiningOp())) {
         mlir::IRMapping mapInfoMap;
-        for (auto bound : mapEntry.getBounds()) {
+        for (mlir::Value bound : mapEntry.getBounds()) {
           if (auto mapEntryBound = mlir::dyn_cast<mlir::omp::DataBoundsOp>(
                   bound.getDefiningOp())) {
             mapInfoMap.map(bound, cloneArgAndChildren(builder, mapEntryBound,

diff  --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 006ff515a18940a..84390576c0523c4 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -29,8 +29,11 @@
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+
 #include <optional>
+#include <utility>
 
 using namespace mlir;
 
@@ -1507,6 +1510,104 @@ int64_t getSizeInBytes(DataLayout &DL, const Type &type) {
   return 0;
 }
 
+static llvm::OffloadEntriesInfoManager::OMPTargetDeviceClauseKind
+convertToDeviceClauseKind(mlir::omp::DeclareTargetDeviceType deviceClause) {
+  switch (deviceClause) {
+  case mlir::omp::DeclareTargetDeviceType::host:
+    return llvm::OffloadEntriesInfoManager::OMPTargetDeviceClauseHost;
+    break;
+  case mlir::omp::DeclareTargetDeviceType::nohost:
+    return llvm::OffloadEntriesInfoManager::OMPTargetDeviceClauseNoHost;
+    break;
+  case mlir::omp::DeclareTargetDeviceType::any:
+    return llvm::OffloadEntriesInfoManager::OMPTargetDeviceClauseAny;
+    break;
+  default:
+    return llvm::OffloadEntriesInfoManager::OMPTargetDeviceClauseNone;
+    break;
+  }
+}
+
+static llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind
+convertToCaptureClauseKind(
+    mlir::omp::DeclareTargetCaptureClause captureClasue) {
+  switch (captureClasue) {
+  case mlir::omp::DeclareTargetCaptureClause::to:
+    return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
+    break;
+  case mlir::omp::DeclareTargetCaptureClause::link:
+    return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
+    break;
+  default:
+    return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryNone;
+    break;
+  }
+}
+
+static llvm::SmallString<64>
+getDeclareTargetRefPtrSuffix(LLVM::GlobalOp globalOp,
+                             llvm::OpenMPIRBuilder &ompBuilder) {
+  llvm::SmallString<64> suffix;
+  llvm::raw_svector_ostream os(suffix);
+  if (globalOp.getVisibility() == mlir::SymbolTable::Visibility::Private) {
+    auto loc = globalOp->getLoc()->findInstanceOf<FileLineColLoc>();
+    auto fileInfoCallBack = [&loc]() {
+      return std::pair<std::string, uint64_t>(
+          llvm::StringRef(loc.getFilename()), loc.getLine());
+    };
+
+    os << llvm::format(
+        "_%x", ompBuilder.getTargetEntryUniqueInfo(fileInfoCallBack).FileID);
+  }
+  os << "_decl_tgt_ref_ptr";
+
+  return suffix;
+}
+
+// Returns the reference pointer generated by the lowering of the declare target
+// operation in cases where the link clause is used or the to clause is used in
+// USM mode.
+static llvm::Value *
+getRefPtrIfDeclareTarget(mlir::Value value,
+                         LLVM::ModuleTranslation &moduleTranslation) {
+  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+
+  // An easier way to do this may just be to keep track of any pointer
+  // references and their mapping to their respective operation
+  if (auto addressOfOp =
+          llvm::dyn_cast_if_present<LLVM::AddressOfOp>(value.getDefiningOp())) {
+    if (auto gOp = llvm::dyn_cast_or_null<LLVM::GlobalOp>(
+            addressOfOp->getParentOfType<mlir::ModuleOp>().lookupSymbol(
+                addressOfOp.getGlobalName()))) {
+
+      if (auto declareTargetGlobal =
+              llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
+                  gOp.getOperation())) {
+
+        // In this case, we must utilise the reference pointer generated by the
+        // declare target operation, similar to Clang
+        if ((declareTargetGlobal.getDeclareTargetCaptureClause() ==
+             mlir::omp::DeclareTargetCaptureClause::link) ||
+            (declareTargetGlobal.getDeclareTargetCaptureClause() ==
+                 mlir::omp::DeclareTargetCaptureClause::to &&
+             ompBuilder->Config.hasRequiresUnifiedSharedMemory())) {
+          llvm::SmallString<64> suffix =
+              getDeclareTargetRefPtrSuffix(gOp, *ompBuilder);
+
+          if (gOp.getSymName().contains(suffix))
+            return moduleTranslation.getLLVMModule()->getNamedValue(
+                gOp.getSymName());
+
+          return moduleTranslation.getLLVMModule()->getNamedValue(
+              (gOp.getSymName().str() + suffix.str()).str());
+        }
+      }
+    }
+  }
+
+  return nullptr;
+}
+
 // Generate all map related information and fill the combinedInfo.
 static void genMapInfos(llvm::IRBuilderBase &builder,
                         LLVM::ModuleTranslation &moduleTranslation,
@@ -1516,7 +1617,7 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
                         const ArrayAttr &mapTypes,
                         const SmallVector<Value> &devPtrOperands = {},
                         const SmallVector<Value> &devAddrOperands = {},
-                        bool IsTargetParams = false) {
+                        bool isTargetParams = false) {
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
   auto fail = [&combinedInfo]() -> void {
@@ -1543,31 +1644,40 @@ static void genMapInfos(llvm::IRBuilderBase &builder,
     // Unlike dev_ptr and dev_addr operands these map operands point
     // to a map entry operation which contains further information
     // on the variable being mapped and how it should be mapped.
-    auto MapInfoOp =
+    auto mapInfoOp =
         mlir::dyn_cast<mlir::omp::MapInfoOp>(mapOp.getDefiningOp());
 
     // TODO: Only LLVMPointerTypes are handled.
-    if (!MapInfoOp.getType().isa<LLVM::LLVMPointerType>())
+    if (!mapInfoOp.getType().isa<LLVM::LLVMPointerType>())
       return fail();
 
     llvm::Value *mapOpValue =
-        moduleTranslation.lookupValue(MapInfoOp.getVarPtr());
-    combinedInfo.BasePointers.emplace_back(mapOpValue);
+        moduleTranslation.lookupValue(mapInfoOp.getVarPtr());
+
+    llvm::Value *refPtr =
+        getRefPtrIfDeclareTarget(mapInfoOp.getVarPtr(), moduleTranslation);
+
+    combinedInfo.BasePointers.emplace_back(refPtr ? refPtr : mapOpValue);
     combinedInfo.Pointers.emplace_back(mapOpValue);
     combinedInfo.DevicePointers.emplace_back(
         llvm::OpenMPIRBuilder::DeviceInfoTy::None);
     combinedInfo.Names.emplace_back(LLVM::createMappingInformation(
-        MapInfoOp.getVarPtr().getLoc(), *ompBuilder));
+        mapInfoOp.getVarPtr().getLoc(), *ompBuilder));
+
+    auto mapFlag = llvm::omp::OpenMPOffloadMappingFlags(
+        mapTypes[index].cast<IntegerAttr>().getUInt());
 
-    combinedInfo.Types.emplace_back(
-        llvm::omp::OpenMPOffloadMappingFlags(
-            mapTypes[index].dyn_cast<IntegerAttr>().getUInt()) |
-        (IsTargetParams
-             ? llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM
-             : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE));
+    // Declare Target Mappings are excluded from being marked as
+    // OMP_MAP_TARGET_PARAM as they are not passed as parameters, they're marked
+    // with OMP_MAP_PTR_AND_OBJ instead.
+    if (refPtr)
+      mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ;
+    else if (isTargetParams)
+      mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM;
 
+    combinedInfo.Types.emplace_back(mapFlag);
     combinedInfo.Sizes.emplace_back(
-        builder.getInt64(getSizeInBytes(DL, MapInfoOp.getVarPtr().getType())));
+        builder.getInt64(getSizeInBytes(DL, mapInfoOp.getVarPtr().getType())));
     index++;
   }
 
@@ -1856,6 +1966,37 @@ static bool targetOpSupported(Operation &opInst) {
   return true;
 }
 
+static void
+handleDeclareTargetMapVar(llvm::ArrayRef<Value> mapOperands,
+                          LLVM::ModuleTranslation &moduleTranslation,
+                          llvm::IRBuilderBase &builder) {
+  for (const mlir::Value &mapOp : mapOperands) {
+    auto mapInfoOp =
+        mlir::dyn_cast<mlir::omp::MapInfoOp>(mapOp.getDefiningOp());
+    llvm::Value *mapOpValue =
+        moduleTranslation.lookupValue(mapInfoOp.getVarPtr());
+    if (auto *declareTarget = getRefPtrIfDeclareTarget(mapInfoOp.getVarPtr(),
+                                                       moduleTranslation)) {
+      // The user's iterator will get invalidated if we modify an element,
+      // so we populate this vector of uses to alter each user on an individual
+      // basis to emit its own load (rather than one load for all).
+      llvm::SmallVector<llvm::User *> userVec;
+      for (llvm::User *user : mapOpValue->users())
+        userVec.push_back(user);
+
+      for (llvm::User *user : userVec) {
+        if (auto *insn = dyn_cast<llvm::Instruction>(user)) {
+          auto *load = builder.CreateLoad(
+              moduleTranslation.convertType(mapInfoOp.getVarPtr().getType()),
+              declareTarget);
+          load->moveBefore(insn);
+          user->replaceUsesOfWith(mapOpValue, load);
+        }
+      }
+    }
+  }
+}
+
 static LogicalResult
 convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
@@ -1866,13 +2007,25 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   auto targetOp = cast<omp::TargetOp>(opInst);
   auto &targetRegion = targetOp.getRegion();
 
+  // This function filters out kernel data that will not show up as kernel
+  // input arguments to the generated kernel function but will still need
+  // explicitly mapped through supplying information to the OpenMP runtime
+  // (declare target). It also prepares some data used for generating the
+  // kernel and populating the associated OpenMP runtime data structures.
+  auto getKernelArguments =
+      [&](const llvm::SetVector<Value> &operandSet,
+          llvm::SmallVectorImpl<llvm::Value *> &llvmInputs) {
+        for (Value operand : operandSet) {
+          if (!getRefPtrIfDeclareTarget(operand, moduleTranslation))
+            llvmInputs.push_back(moduleTranslation.lookupValue(operand));
+        }
+      };
+
   llvm::SetVector<Value> operandSet;
   getUsedValuesDefinedAbove(targetRegion, operandSet);
 
-  // Collect the input arguments.
   llvm::SmallVector<llvm::Value *> inputs;
-  for (Value operand : operandSet)
-    inputs.push_back(moduleTranslation.lookupValue(operand));
+  getKernelArguments(operandSet, inputs);
 
   LogicalResult bodyGenStatus = success();
 
@@ -1939,18 +2092,24 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
       ompLoc, allocaIP, builder.saveIP(), entryInfo, defaultValTeams,
       defaultValThreads, inputs, genMapInfoCB, bodyCB));
 
+  // Remap access operations to declare target reference pointers for the
+  // device, essentially generating extra loadop's as necessary
+  if (moduleTranslation.getOpenMPBuilder()->Config.isTargetDevice()) {
+    SmallVector<Value> mapOperands = targetOp.getMapOperands();
+    handleDeclareTargetMapVar(llvm::ArrayRef(mapOperands), moduleTranslation,
+                              builder);
+  }
   return bodyGenStatus;
 }
 
 static LogicalResult
-convertDeclareTargetAttr(Operation *op,
-                         omp::DeclareTargetAttr declareTargetAttr,
+convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute,
                          LLVM::ModuleTranslation &moduleTranslation) {
   // Amend omp.declare_target by deleting the IR of the outlined functions
   // created for target regions. They cannot be filtered out from MLIR earlier
-  // because the omp.target operation inside must be translated to LLVM, but the
-  // wrapper functions themselves must not remain at the end of the process.
-  // We know that functions where omp.declare_target does not match
+  // because the omp.target operation inside must be translated to LLVM, but
+  // the wrapper functions themselves must not remain at the end of the
+  // process. We know that functions where omp.declare_target does not match
   // omp.is_target_device at this stage can only be wrapper functions because
   // those that aren't are removed earlier as an MLIR transformation pass.
   if (FunctionOpInterface funcOp = dyn_cast<FunctionOpInterface>(op)) {
@@ -1960,7 +2119,8 @@ convertDeclareTargetAttr(Operation *op,
         return success();
 
       omp::DeclareTargetDeviceType declareType =
-          declareTargetAttr.getDeviceType().getValue();
+          attribute.getDeviceType().getValue();
+
       if (declareType == omp::DeclareTargetDeviceType::host) {
         llvm::Function *llvmFunc =
             moduleTranslation.lookupFunction(funcOp.getName());
@@ -1968,7 +2128,77 @@ convertDeclareTargetAttr(Operation *op,
         llvmFunc->eraseFromParent();
       }
     }
+    return success();
+  }
+
+  if (LLVM::GlobalOp gOp = dyn_cast<LLVM::GlobalOp>(op)) {
+    llvm::Module *llvmModule = moduleTranslation.getLLVMModule();
+    if (auto *gVal = llvmModule->getNamedValue(gOp.getSymName())) {
+      llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+      bool isDeclaration = gOp.isDeclaration();
+      bool isExternallyVisible =
+          gOp.getVisibility() != mlir::SymbolTable::Visibility::Private;
+      auto loc = op->getLoc()->findInstanceOf<FileLineColLoc>();
+      llvm::StringRef mangledName = gOp.getSymName();
+      auto captureClause =
+          convertToCaptureClauseKind(attribute.getCaptureClause().getValue());
+      auto deviceClause =
+          convertToDeviceClauseKind(attribute.getDeviceType().getValue());
+      // unused for MLIR at the moment, required in Clang for book
+      // keeping
+      std::vector<llvm::GlobalVariable *> generatedRefs;
+
+      std::vector<llvm::Triple> targetTriple;
+      auto targetTripleAttr =
+          op->getParentOfType<mlir::ModuleOp>()
+              ->getAttr(LLVM::LLVMDialect::getTargetTripleAttrName())
+              .dyn_cast_or_null<mlir::StringAttr>();
+      if (targetTripleAttr)
+        targetTriple.emplace_back(targetTripleAttr.data());
+
+      auto fileInfoCallBack = [&loc]() {
+        std::string filename = "";
+        std::uint64_t lineNo = 0;
+
+        if (loc) {
+          filename = loc.getFilename().str();
+          lineNo = loc.getLine();
+        }
+
+        return std::pair<std::string, std::uint64_t>(llvm::StringRef(filename),
+                                                     lineNo);
+      };
+
+      ompBuilder->registerTargetGlobalVariable(
+          captureClause, deviceClause, isDeclaration, isExternallyVisible,
+          ompBuilder->getTargetEntryUniqueInfo(fileInfoCallBack), mangledName,
+          generatedRefs, /*OpenMPSimd*/ false, targetTriple,
+          /*GlobalInitializer*/ nullptr, /*VariableLinkage*/ nullptr,
+          gVal->getType(), gVal);
+
+      if (ompBuilder->Config.isTargetDevice() &&
+          (attribute.getCaptureClause().getValue() !=
+               mlir::omp::DeclareTargetCaptureClause::to ||
+           ompBuilder->Config.hasRequiresUnifiedSharedMemory())) {
+        ompBuilder->getAddrOfDeclareTargetVar(
+            captureClause, deviceClause, isDeclaration, isExternallyVisible,
+            ompBuilder->getTargetEntryUniqueInfo(fileInfoCallBack), mangledName,
+            generatedRefs, /*OpenMPSimd*/ false, targetTriple, gVal->getType(),
+            /*GlobalInitializer*/ nullptr,
+            /*VariableLinkage*/ nullptr);
+        // A global has already been generated by this stage, unlike Clang, so
+        // this needs to be specially removed here for device when we're
+        // anything but a To clause specified variable with no unified shared
+        // memory.
+        if (llvm::GlobalValue *llvmVal =
+                llvmModule->getNamedValue(mangledName)) {
+          llvmVal->removeFromParent();
+          llvmVal->dropAllReferences();
+        }
+      }
+    }
   }
+
   return success();
 }
 

diff  --git a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
new file mode 100644
index 000000000000000..f279772bd6c0969
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// This test the generation of additional load operations for declare target link variables
+// inside of target op regions when lowering to IR for device. Unfortunately as the host file is not 
+// passed as a module attribute, we miss out on the metadata and entryinfo.
+//
+// Unfortunately, only so much can be tested as the device side is dependent on a *.bc 
+// file created by the host and appended as an attribute to the module.
+
+module attributes {omp.is_target_device = true} {
+  // CHECK-DAG: @_QMtest_0Esp_decl_tgt_ref_ptr = weak global ptr null, align 8
+  llvm.mlir.global external @_QMtest_0Esp() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    llvm.return %0 : i32
+  }
+                                                            
+  llvm.func @_QQmain() attributes {} {
+    %0 = llvm.mlir.addressof @_QMtest_0Esp : !llvm.ptr<i32>
+  
+  // CHECK-DAG:   omp.target:                                       ; preds = %user_code.entry
+  // CHECK-DAG: %1 = load ptr, ptr @_QMtest_0Esp_decl_tgt_ref_ptr, align 8
+  // CHECK-DAG: store i32 1, ptr %1, align 4
+  // CHECK-DAG: br label %omp.region.cont
+    %map = omp.map_info var_ptr(%0 : !llvm.ptr<i32>)   map_clauses(tofrom) capture(ByRef) -> !llvm.ptr<i32> {name = ""}
+    omp.target   map_entries(%map : !llvm.ptr<i32>) {
+      %1 = llvm.mlir.constant(1 : i32) : i32
+      llvm.store %1, %0 : !llvm.ptr<i32>
+      omp.terminator
+    }
+
+    llvm.return
+  }
+}

diff  --git a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-host.mlir b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-host.mlir
new file mode 100644
index 000000000000000..709af26f1d3dfa4
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-host.mlir
@@ -0,0 +1,140 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK-DAG: %struct.__tgt_offload_entry = type { ptr, ptr, i64, i32, i32 } 
+// CHECK-DAG: !omp_offload.info = !{!{{.*}}}
+module attributes {llvm.target_triple = "x86_64-unknown-linux-gnu", omp.is_target_device = false} {
+
+  // CHECK-DAG: @_QMtest_0Earray_1d = global [3 x i32] [i32 1, i32 2, i32 3]
+  // CHECK-DAG: @_QMtest_0Earray_1d_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Earray_1d 
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [36 x i8] c"_QMtest_0Earray_1d_decl_tgt_ref_ptr\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Earray_1d_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Earray_1d_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Earray_1d_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Earray_1d(dense<[1, 2, 3]> : tensor<3xi32>) {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !llvm.array<3 x i32>
+
+  // CHECK-DAG: @_QMtest_0Earray_2d = global [2 x [2 x i32]] {{.*}}
+  // CHECK-DAG: @_QMtest_0Earray_2d_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Earray_2d 
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [36 x i8] c"_QMtest_0Earray_2d_decl_tgt_ref_ptr\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Earray_2d_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Earray_2d_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Earray_2d_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Earray_2d() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !llvm.array<2 x array<2 x i32>> {
+    %0 = llvm.mlir.undef : !llvm.array<2 x array<2 x i32>>
+    %1 = llvm.mlir.constant(1 : i32) : i32
+    %2 = llvm.insertvalue %1, %0[0, 0] : !llvm.array<2 x array<2 x i32>> 
+    %3 = llvm.mlir.constant(2 : i32) : i32
+    %4 = llvm.insertvalue %3, %2[0, 1] : !llvm.array<2 x array<2 x i32>> 
+    %5 = llvm.mlir.constant(3 : i32) : i32
+    %6 = llvm.insertvalue %5, %4[1, 0] : !llvm.array<2 x array<2 x i32>> 
+    %7 = llvm.mlir.constant(4 : i32) : i32
+    %8 = llvm.insertvalue %7, %6[1, 1] : !llvm.array<2 x array<2 x i32>> 
+    %9 = llvm.mlir.constant(2 : index) : i64
+    %10 = llvm.mlir.constant(2 : index) : i64
+    llvm.return %8 : !llvm.array<2 x array<2 x i32>>
+  }
+
+  // CHECK-DAG:  @_QMtest_0Edata_extended_link_1 = global float 2.000000e+00
+  // CHECK-DAG:  @_QMtest_0Edata_extended_link_1_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Edata_extended_link_1
+  // CHECK-DAG:  @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [48 x i8] c"_QMtest_0Edata_extended_link_1_decl_tgt_ref_ptr\00"
+  // CHECK-DAG:  @.omp_offloading.entry._QMtest_0Edata_extended_link_1_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_extended_link_1_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG:  !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_extended_link_1_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_extended_link_1() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : f32 {
+    %0 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+    llvm.return %0 : f32
+  }
+
+  // CHECK-DAG:  @_QMtest_0Edata_extended_link_2 = global float 3.000000e+00
+  // CHECK-DAG:  @_QMtest_0Edata_extended_link_2_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Edata_extended_link_2
+  // CHECK-DAG:  @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [48 x i8] c"_QMtest_0Edata_extended_link_2_decl_tgt_ref_ptr\00"
+  // CHECK-DAG:  @.omp_offloading.entry._QMtest_0Edata_extended_link_2_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_extended_link_2_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG:  !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_extended_link_2_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_extended_link_2() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : f32 {
+    %0 = llvm.mlir.constant(3.000000e+00 : f32) : f32
+    llvm.return %0 : f32
+  }
+
+  // CHECK-DAG: @_QMtest_0Edata_extended_to_1 = global float 2.000000e+00
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [29 x i8] c"_QMtest_0Edata_extended_to_1\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Edata_extended_to_1 = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_extended_to_1, ptr @.omp_offloading.entry_name{{.*}}, i64 4, i32 0, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_extended_to_1", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_extended_to_1() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32 {
+    %0 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+    llvm.return %0 : f32
+  }
+
+  // CHECK-DAG: @_QMtest_0Edata_extended_to_2 = global float 3.000000e+00
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [29 x i8] c"_QMtest_0Edata_extended_to_2\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Edata_extended_to_2 = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_extended_to_2, ptr @.omp_offloading.entry_name{{.*}}, i64 4, i32 0, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_extended_to_2", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_extended_to_2() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : f32 {
+    %0 = llvm.mlir.constant(3.000000e+00 : f32) : f32
+    llvm.return %0 : f32
+  }
+
+  // CHECK-DAG: @_QMtest_0Edata_int = global i32 10
+  // CHECK-DAG: @_QMtest_0Edata_int_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Edata_int 
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [36 x i8] c"_QMtest_0Edata_int_decl_tgt_ref_ptr\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Edata_int_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_int_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_int_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_int() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : i32 {
+    %0 = llvm.mlir.constant(10 : i32) : i32 // The global's expected-value line above must spell out this full value (10).
+    llvm.return %0 : i32
+  }
+
+  // CHECK-DAG: @_QMtest_0Edata_int_clauseless = global i32 1
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [30 x i8] c"_QMtest_0Edata_int_clauseless\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Edata_int_clauseless = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_int_clauseless, ptr @.omp_offloading.entry_name{{.*}}, i64 4, i32 0, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_int_clauseless", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_int_clauseless() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : i32 {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    llvm.return %0 : i32
+  }
+
+  // CHECK-DAG: @_QMtest_0Edata_int_to = global i32 5
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [22 x i8] c"_QMtest_0Edata_int_to\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Edata_int_to = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Edata_int_to, ptr @.omp_offloading.entry_name{{.*}}, i64 4, i32 0, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Edata_int_to", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Edata_int_to() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : i32 {
+    %0 = llvm.mlir.constant(5 : i32) : i32
+    llvm.return %0 : i32
+  }
+
+  // CHECK-DAG: @_QMtest_0Ept1 = global { ptr, i64, i32, i8, i8, i8, i8 } { ptr null, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20180515, i8 0, i8 9, i8 1, i8 0 }
+  // CHECK-DAG: @_QMtest_0Ept1_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Ept1
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [31 x i8] c"_QMtest_0Ept1_decl_tgt_ref_ptr\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Ept1_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Ept1_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Ept1_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Ept1() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> {
+    %0 = llvm.mlir.null : !llvm.ptr<i32>
+    %1 = llvm.mlir.constant(9 : i32) : i32
+    %2 = llvm.mlir.null : !llvm.ptr<i32>
+    %3 = llvm.getelementptr %2[1] : (!llvm.ptr<i32>) -> !llvm.ptr<i32>
+    %4 = llvm.ptrtoint %3 : !llvm.ptr<i32> to i64
+    %5 = llvm.mlir.undef : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)>
+    %6 = llvm.insertvalue %4, %5[1] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    %7 = llvm.mlir.constant(20180515 : i32) : i32
+    %8 = llvm.insertvalue %7, %6[2] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    %9 = llvm.mlir.constant(0 : i32) : i32
+    %10 = llvm.trunc %9 : i32 to i8
+    %11 = llvm.insertvalue %10, %8[3] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    %12 = llvm.trunc %1 : i32 to i8
+    %13 = llvm.insertvalue %12, %11[4] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    %14 = llvm.mlir.constant(1 : i32) : i32
+    %15 = llvm.trunc %14 : i32 to i8
+    %16 = llvm.insertvalue %15, %13[5] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    %17 = llvm.mlir.constant(0 : i32) : i32
+    %18 = llvm.trunc %17 : i32 to i8
+    %19 = llvm.insertvalue %18, %16[6] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    %20 = llvm.bitcast %0 : !llvm.ptr<i32> to !llvm.ptr<i32>
+    %21 = llvm.insertvalue %20, %19[0] : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)> 
+    llvm.return %21 : !llvm.struct<(ptr<i32>, i64, i32, i8, i8, i8, i8)>
+  }
+
+  // CHECK-DAG: @_QMtest_0Ept2_tar = global i32 5
+  // CHECK-DAG: @_QMtest_0Ept2_tar_decl_tgt_ref_ptr = weak global ptr @_QMtest_0Ept2_tar
+  // CHECK-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [35 x i8] c"_QMtest_0Ept2_tar_decl_tgt_ref_ptr\00"
+  // CHECK-DAG: @.omp_offloading.entry._QMtest_0Ept2_tar_decl_tgt_ref_ptr = weak constant %struct.__tgt_offload_entry { ptr @_QMtest_0Ept2_tar_decl_tgt_ref_ptr, ptr @.omp_offloading.entry_name{{.*}}, i64 8, i32 1, i32 0 }, section "omp_offloading_entries", align 1
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Ept2_tar_decl_tgt_ref_ptr", i32 {{.*}}, i32 {{.*}}}
+  llvm.mlir.global external @_QMtest_0Ept2_tar() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : i32 {
+    %0 = llvm.mlir.constant(5 : i32) : i32
+    llvm.return %0 : i32
+  }
+}

diff  --git a/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90 b/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90
new file mode 100644
index 000000000000000..f5e3ae00653a9ab
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/declare-target-array-in-target-region.f90
@@ -0,0 +1,34 @@
+! Offloading test with a target region that maps a declare target
+! Fortran array, writes some values to it, and checks that the host
+! correctly receives the updates made on the device.
+! REQUIRES: flang, amdgcn-amd-amdhsa
+! UNSUPPORTED: nvptx64-nvidia-cuda
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+module test_0
+    implicit none
+    INTEGER :: sp(10) = (/0,0,0,0,0,0,0,0,0,0/)
+    !$omp declare target link(sp)
+end module test_0
+
+program main
+    use test_0
+    integer :: i = 1
+    integer :: j = 10 ! loop upper bound; must not exceed size(sp) (10) or sp(i) writes out of bounds
+!$omp target map(tofrom:sp, i, j)
+    do while (i <= j)
+        sp(i) = i
+        i = i + 1
+    end do
+!$omp end target
+
+PRINT *, sp(:)
+
+end program
+
+! CHECK: 1 2 3 4 5 6 7 8 9 10


        


More information about the flang-commits mailing list