[flang-commits] [flang] d2d4c3b - [MLIR][OpenMP] LLVM IR translation of host_eval (#116052)

via flang-commits flang-commits@lists.llvm.org
Tue Jan 14 05:07:42 PST 2025


Author: Sergio Afonso
Date: 2025-01-14T13:07:38Z
New Revision: d2d4c3bd598453a1d871096466953ec25483034f

URL: https://github.com/llvm/llvm-project/commit/d2d4c3bd598453a1d871096466953ec25483034f
DIFF: https://github.com/llvm/llvm-project/commit/d2d4c3bd598453a1d871096466953ec25483034f.diff

LOG: [MLIR][OpenMP] LLVM IR translation of host_eval (#116052)

This patch adds support for processing the `host_eval` clause of
`omp.target` to populate default and runtime kernel launch attributes.
Specifically, these relate to the `num_teams`, `thread_limit` and
`num_threads` clauses attached to operations nested inside of
`omp.target`. As a result, the `thread_limit` clause of `omp.target` is
also supported.
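
For illustration, here is a minimal sketch (modelled on the host-launch
test added by this patch; the function and value names are placeholders)
of how host-evaluated values flow into the kernel launch:

    llvm.func @launch(%num_teams : i32, %thread_limit : i32) {
      // host_eval forwards host-computed values into the target region,
      // where nested constructs consume them as kernel launch bounds.
      omp.target host_eval(%num_teams -> %arg_teams, %thread_limit -> %arg_limit : i32, i32) {
        omp.teams num_teams(to %arg_teams : i32) thread_limit(%arg_limit : i32) {
          omp.terminator
        }
        omp.terminator
      }
      llvm.return
    }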

The implementation of `initTargetDefaultAttrs()` is intended to reflect
clang's own processing of multiple constructs and clauses in order to
define a default number of teams and threads to be used as kernel
attributes and to populate global variables in the target device module.
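
For example, in a kernel like the following (a reduced sketch mirroring
the second case of the new device-launch test; the function name is
arbitrary), the minimum and maximum number of teams both default to 40
and the maximum number of threads to 30:

    llvm.func @defaults() {
      // Constant clauses picked up by initTargetDefaultAttrs() when
      // translating for a target device.
      %tl = llvm.mlir.constant(30) : i32
      omp.target thread_limit(%tl : i32) {
        %nt = llvm.mlir.constant(40) : i32
        omp.teams num_teams(to %nt : i32) {
          omp.terminator
        }
        omp.terminator
      }
      llvm.return
    }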

One side effect of this change is that target device MLIR modules can no
longer be translated to LLVM IR unless they have a supported target
triple. This is because the local `getGridValue()` function in the
`OpenMPIRBuilder` only works for certain architectures, and it is called
whenever the maximum number of threads has not been explicitly defined.
This limitation matches clang's behavior.
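
Concretely, the triple is taken from the module attributes, as in the
updated device tests (a minimal sketch):

    // Device modules containing omp.target regions now need a supported
    // triple, set here as a module attribute.
    module attributes {llvm.target_triple = "amdgcn-amd-amdhsa",
                       omp.is_target_device = true} {
    }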

Evaluating the collapsed loop trip count of SPMD and Generic-SPMD
kernels remains unsupported.
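
For reference, the still-unsupported pattern is host evaluation of
`omp.loop_nest` bounds, shown here as a reduced sketch (with an arbitrary
function name) of the case exercised by the updated openmp-todo.mlir test:

    llvm.func @todo(%x : i32) {
      // host_eval values used as loop bounds still report a TODO error.
      omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) {
        omp.teams {
          omp.parallel {
            omp.distribute {
              omp.wsloop {
                omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
                  omp.yield
                }
              } {omp.composite}
            } {omp.composite}
            omp.terminator
          } {omp.composite}
          omp.terminator
        }
        omp.terminator
      }
      llvm.return
    }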

Added: 
    mlir/test/Target/LLVMIR/openmp-target-launch-device.mlir
    mlir/test/Target/LLVMIR/openmp-target-launch-host.mlir

Modified: 
    flang/test/Integration/OpenMP/target-filtering.f90
    flang/test/Lower/OpenMP/function-filtering-2.f90
    flang/test/Lower/OpenMP/function-filtering-3.f90
    flang/test/Lower/OpenMP/function-filtering.f90
    mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
    mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir
    mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir
    mlir/test/Target/LLVMIR/omptarget-constant-indexing-device-region.mlir
    mlir/test/Target/LLVMIR/omptarget-debug.mlir
    mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
    mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
    mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
    mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir
    mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
    mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir
    mlir/test/Target/LLVMIR/openmp-task-target-device.mlir
    mlir/test/Target/LLVMIR/openmp-todo.mlir

Removed: 
    


################################################################################
diff --git a/flang/test/Integration/OpenMP/target-filtering.f90 b/flang/test/Integration/OpenMP/target-filtering.f90
index d1ab1b47e580d4..699c1040d91f9c 100644
--- a/flang/test/Integration/OpenMP/target-filtering.f90
+++ b/flang/test/Integration/OpenMP/target-filtering.f90
@@ -7,7 +7,7 @@
 !===----------------------------------------------------------------------===!
 
 !RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s --check-prefixes HOST,ALL
-!RUN: %flang_fc1 -emit-llvm -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes DEVICE,ALL
+!RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-llvm -fopenmp -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes DEVICE,ALL
 
 !HOST: define {{.*}}@{{.*}}before{{.*}}(
 !DEVICE-NOT: define {{.*}}@before{{.*}}(

diff --git a/flang/test/Lower/OpenMP/function-filtering-2.f90 b/flang/test/Lower/OpenMP/function-filtering-2.f90
index 0c02aa223820e7..a2c5e29cfdcbf6 100644
--- a/flang/test/Lower/OpenMP/function-filtering-2.f90
+++ b/flang/test/Lower/OpenMP/function-filtering-2.f90
@@ -1,9 +1,9 @@
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-HOST %s
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-DEVICE %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM,LLVM-DEVICE %s
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefix=MLIR %s
 ! RUN: bbc -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
-! RUN: bbc -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
+! RUN: bbc -target amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
 
 ! MLIR: func.func @{{.*}}implicit_invocation() attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to)>}
 ! MLIR: return

diff --git a/flang/test/Lower/OpenMP/function-filtering-3.f90 b/flang/test/Lower/OpenMP/function-filtering-3.f90
index a277c06d620669..4754da85de3aa7 100644
--- a/flang/test/Lower/OpenMP/function-filtering-3.f90
+++ b/flang/test/Lower/OpenMP/function-filtering-3.f90
@@ -1,9 +1,9 @@
 ! RUN: %flang_fc1 -fopenmp -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-HOST,LLVM-ALL %s
 ! RUN: %flang_fc1 -fopenmp -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
 ! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
-! RUN: bbc -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
+! RUN: bbc -target amdgcn-amd-amdhsa -fopenmp -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
 
 ! Check that the correct LLVM IR functions are kept for the host and device
 ! after running the whole set of translation and transformation passes from

diff --git a/flang/test/Lower/OpenMP/function-filtering.f90 b/flang/test/Lower/OpenMP/function-filtering.f90
index a72822503c3739..9f37e6f028d7a9 100644
--- a/flang/test/Lower/OpenMP/function-filtering.f90
+++ b/flang/test/Lower/OpenMP/function-filtering.f90
@@ -1,9 +1,9 @@
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-HOST,LLVM-ALL %s
 ! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
-! RUN: %flang_fc1 -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -flang-experimental-hlfir -emit-llvm %s -o - | FileCheck --check-prefixes=LLVM-DEVICE,LLVM-ALL %s
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
 ! RUN: bbc -fopenmp -fopenmp-version=52 -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-HOST,MLIR-ALL %s
-! RUN: bbc -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
+! RUN: bbc -target amdgcn-amd-amdhsa -fopenmp -fopenmp-version=52 -fopenmp-is-target-device -emit-hlfir %s -o - | FileCheck --check-prefixes=MLIR-DEVICE,MLIR-ALL %s
 
 ! Check that the correct LLVM IR functions are kept for the host and device
 ! after running the whole set of translation and transformation passes from

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 5c36187540690e..0be515e63b470c 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -174,9 +174,14 @@ static LogicalResult checkImplementationStatus(Operation &op) {
     if (op.getHint())
       op.emitWarning("hint clause discarded");
   };
-  auto checkHostEval = [&todo](auto op, LogicalResult &result) {
-    if (!op.getHostEvalVars().empty())
-      result = todo("host_eval");
+  auto checkHostEval = [](auto op, LogicalResult &result) {
+    // Host evaluated clauses are supported, except for loop bounds.
+    for (BlockArgument arg :
+         cast<omp::BlockArgOpenMPOpInterface>(*op).getHostEvalBlockArgs())
+      for (Operation *user : arg.getUsers())
+        if (isa<omp::LoopNestOp>(user))
+          result = op.emitError("not yet implemented: host evaluation of loop "
+                                "bounds in omp.target operation");
   };
   auto checkIf = [&todo](auto op, LogicalResult &result) {
     if (op.getIfExpr())
@@ -216,18 +221,30 @@ static LogicalResult checkImplementationStatus(Operation &op) {
       result = todo("priority");
   };
   auto checkPrivate = [&todo](auto op, LogicalResult &result) {
-    if (!op.getPrivateVars().empty() || op.getPrivateSyms())
-      result = todo("privatization");
+    if constexpr (std::is_same_v<std::decay_t<decltype(op)>, omp::TargetOp>) {
+      // Privatization clauses are supported, except in some situations, so we
+      // need to check here whether any of these unsupported cases are being
+      // translated.
+      if (std::optional<ArrayAttr> privateSyms = op.getPrivateSyms()) {
+        for (Attribute privatizerNameAttr : *privateSyms) {
+          omp::PrivateClauseOp privatizer = findPrivatizer(
+              op.getOperation(), cast<SymbolRefAttr>(privatizerNameAttr));
+
+          if (privatizer.getDataSharingType() ==
+              omp::DataSharingClauseType::FirstPrivate)
+            result = todo("firstprivate");
+        }
+      }
+    } else {
+      if (!op.getPrivateVars().empty() || op.getPrivateSyms())
+        result = todo("privatization");
+    }
   };
   auto checkReduction = [&todo](auto op, LogicalResult &result) {
     if (!op.getReductionVars().empty() || op.getReductionByref() ||
         op.getReductionSyms())
       result = todo("reduction");
   };
-  auto checkThreadLimit = [&todo](auto op, LogicalResult &result) {
-    if (op.getThreadLimit())
-      result = todo("thread_limit");
-  };
   auto checkTaskReduction = [&todo](auto op, LogicalResult &result) {
     if (!op.getTaskReductionVars().empty() || op.getTaskReductionByref() ||
         op.getTaskReductionSyms())
@@ -292,20 +309,7 @@ static LogicalResult checkImplementationStatus(Operation &op) {
         checkIf(op, result);
         checkInReduction(op, result);
         checkIsDevicePtr(op, result);
-        // Privatization clauses are supported, except on some situations, so we
-        // need to check here whether any of these unsupported cases are being
-        // translated.
-        if (std::optional<ArrayAttr> privateSyms = op.getPrivateSyms()) {
-          for (Attribute privatizerNameAttr : *privateSyms) {
-            omp::PrivateClauseOp privatizer = findPrivatizer(
-                op.getOperation(), cast<SymbolRefAttr>(privatizerNameAttr));
-
-            if (privatizer.getDataSharingType() ==
-                omp::DataSharingClauseType::FirstPrivate)
-              result = todo("firstprivate");
-          }
-        }
-        checkThreadLimit(op, result);
+        checkPrivate(op, result);
       })
       .Default([](Operation &) {
         // Assume all clauses for an operation can be translated unless they are
@@ -3919,6 +3923,223 @@ createDeviceArgumentAccessor(MapInfoData &mapData, llvm::Argument &arg,
   return builder.saveIP();
 }
 
+/// Follow uses of `host_eval`-defined block arguments of the given `omp.target`
+/// operation and populate output variables with their corresponding host value
+/// (i.e. operand evaluated outside of the target region), based on their uses
+/// inside of the target region.
+///
+/// Loop bounds and steps are only optionally populated, if output vectors are
+/// provided.
+static void extractHostEvalClauses(omp::TargetOp targetOp, Value &numThreads,
+                                   Value &numTeamsLower, Value &numTeamsUpper,
+                                   Value &threadLimit) {
+  auto blockArgIface = llvm::cast<omp::BlockArgOpenMPOpInterface>(*targetOp);
+  for (auto item : llvm::zip_equal(targetOp.getHostEvalVars(),
+                                   blockArgIface.getHostEvalBlockArgs())) {
+    Value hostEvalVar = std::get<0>(item), blockArg = std::get<1>(item);
+
+    for (Operation *user : blockArg.getUsers()) {
+      llvm::TypeSwitch<Operation *>(user)
+          .Case([&](omp::TeamsOp teamsOp) {
+            if (teamsOp.getNumTeamsLower() == blockArg)
+              numTeamsLower = hostEvalVar;
+            else if (teamsOp.getNumTeamsUpper() == blockArg)
+              numTeamsUpper = hostEvalVar;
+            else if (teamsOp.getThreadLimit() == blockArg)
+              threadLimit = hostEvalVar;
+            else
+              llvm_unreachable("unsupported host_eval use");
+          })
+          .Case([&](omp::ParallelOp parallelOp) {
+            if (parallelOp.getNumThreads() == blockArg)
+              numThreads = hostEvalVar;
+            else
+              llvm_unreachable("unsupported host_eval use");
+          })
+          .Case([&](omp::LoopNestOp loopOp) {
+            // TODO: Extract bounds and step values. Currently, this cannot be
+            // reached because translation would have been stopped earlier as a
+            // result of `checkImplementationStatus` detecting and reporting
+            // this situation.
+            llvm_unreachable("unsupported host_eval use");
+          })
+          .Default([](Operation *) {
+            llvm_unreachable("unsupported host_eval use");
+          });
+    }
+  }
+}
+
+/// If \p op is of the given type parameter, return it casted to that type.
+/// Otherwise, if its immediate parent operation (or some other higher-level
+/// parent, if \p immediateParent is false) is of that type, return that parent
+/// casted to the given type.
+///
+/// If \p op is \c null or neither it nor its parent(s) are of the specified
+/// type, return a \c null operation.
+template <typename OpTy>
+static OpTy castOrGetParentOfType(Operation *op, bool immediateParent = false) {
+  if (!op)
+    return OpTy();
+
+  if (OpTy casted = dyn_cast<OpTy>(op))
+    return casted;
+
+  if (immediateParent)
+    return dyn_cast_if_present<OpTy>(op->getParentOp());
+
+  return op->getParentOfType<OpTy>();
+}
+
+/// If the given \p value is defined by an \c llvm.mlir.constant operation and
+/// it is of an integer type, return its value.
+static std::optional<int64_t> extractConstInteger(Value value) {
+  if (!value)
+    return std::nullopt;
+
+  if (auto constOp =
+          dyn_cast_if_present<LLVM::ConstantOp>(value.getDefiningOp()))
+    if (auto constAttr = dyn_cast<IntegerAttr>(constOp.getValue()))
+      return constAttr.getInt();
+
+  return std::nullopt;
+}
+
+/// Populate `MinTeams`, `MaxTeams` and `MaxThreads` with their default values,
+/// as stated by the corresponding clauses, if constant.
+///
+/// These default values must be set before the creation of the outlined LLVM
+/// function for the target region, so that they can be used to initialize the
+/// corresponding global `ConfigurationEnvironmentTy` structure.
+static void
+initTargetDefaultAttrs(omp::TargetOp targetOp,
+                       llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &attrs,
+                       bool isTargetDevice) {
+  // TODO: Handle constant 'if' clauses.
+  Operation *capturedOp = targetOp.getInnermostCapturedOmpOp();
+
+  Value numThreads, numTeamsLower, numTeamsUpper, threadLimit;
+  if (!isTargetDevice) {
+    extractHostEvalClauses(targetOp, numThreads, numTeamsLower, numTeamsUpper,
+                           threadLimit);
+  } else {
+    // In the target device, values for these clauses are not passed as
+    // host_eval, but instead evaluated prior to entry to the region. This
+    // ensures values are mapped and available inside of the target region.
+    if (auto teamsOp = castOrGetParentOfType<omp::TeamsOp>(capturedOp)) {
+      numTeamsLower = teamsOp.getNumTeamsLower();
+      numTeamsUpper = teamsOp.getNumTeamsUpper();
+      threadLimit = teamsOp.getThreadLimit();
+    }
+
+    if (auto parallelOp = castOrGetParentOfType<omp::ParallelOp>(capturedOp))
+      numThreads = parallelOp.getNumThreads();
+  }
+
+  // Handle clauses impacting the number of teams.
+
+  int32_t minTeamsVal = 1, maxTeamsVal = -1;
+  if (castOrGetParentOfType<omp::TeamsOp>(capturedOp)) {
+    // TODO: Use `hostNumTeamsLower` to initialize `minTeamsVal`. For now, match
+    // clang and set min and max to the same value.
+    if (numTeamsUpper) {
+      if (auto val = extractConstInteger(numTeamsUpper))
+        minTeamsVal = maxTeamsVal = *val;
+    } else {
+      minTeamsVal = maxTeamsVal = 0;
+    }
+  } else if (castOrGetParentOfType<omp::ParallelOp>(capturedOp,
+                                                    /*immediateParent=*/true) ||
+             castOrGetParentOfType<omp::SimdOp>(capturedOp,
+                                                /*immediateParent=*/true)) {
+    minTeamsVal = maxTeamsVal = 1;
+  } else {
+    minTeamsVal = maxTeamsVal = -1;
+  }
+
+  // Handle clauses impacting the number of threads.
+
+  auto setMaxValueFromClause = [](Value clauseValue, int32_t &result) {
+    if (!clauseValue)
+      return;
+
+    if (auto val = extractConstInteger(clauseValue))
+      result = *val;
+
+    // Found an applicable clause, so it's not undefined. Mark as unknown
+    // because it's not constant.
+    if (result < 0)
+      result = 0;
+  };
+
+  // Extract 'thread_limit' clause from 'target' and 'teams' directives.
+  int32_t targetThreadLimitVal = -1, teamsThreadLimitVal = -1;
+  setMaxValueFromClause(targetOp.getThreadLimit(), targetThreadLimitVal);
+  setMaxValueFromClause(threadLimit, teamsThreadLimitVal);
+
+  // Extract 'max_threads' clause from 'parallel' or set to 1 if it's SIMD.
+  int32_t maxThreadsVal = -1;
+  if (castOrGetParentOfType<omp::ParallelOp>(capturedOp))
+    setMaxValueFromClause(numThreads, maxThreadsVal);
+  else if (castOrGetParentOfType<omp::SimdOp>(capturedOp,
+                                              /*immediateParent=*/true))
+    maxThreadsVal = 1;
+
+  // For max values, < 0 means unset, == 0 means set but unknown. Select the
+  // minimum value between 'max_threads' and 'thread_limit' clauses that were
+  // set.
+  int32_t combinedMaxThreadsVal = targetThreadLimitVal;
+  if (combinedMaxThreadsVal < 0 ||
+      (teamsThreadLimitVal >= 0 && teamsThreadLimitVal < combinedMaxThreadsVal))
+    combinedMaxThreadsVal = teamsThreadLimitVal;
+
+  if (combinedMaxThreadsVal < 0 ||
+      (maxThreadsVal >= 0 && maxThreadsVal < combinedMaxThreadsVal))
+    combinedMaxThreadsVal = maxThreadsVal;
+
+  // Update kernel bounds structure for the `OpenMPIRBuilder` to use.
+  attrs.MinTeams = minTeamsVal;
+  attrs.MaxTeams.front() = maxTeamsVal;
+  attrs.MinThreads = 1;
+  attrs.MaxThreads.front() = combinedMaxThreadsVal;
+}
+
+/// Gather LLVM runtime values for all clauses evaluated in the host that are
+/// passed to the kernel invocation.
+///
+/// This function must be called only when compiling for the host. Also, it will
+/// only provide correct results if it's called after the body of \c targetOp
+/// has been fully generated.
+static void
+initTargetRuntimeAttrs(llvm::IRBuilderBase &builder,
+                       LLVM::ModuleTranslation &moduleTranslation,
+                       omp::TargetOp targetOp,
+                       llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs &attrs) {
+  Value numThreads, numTeamsLower, numTeamsUpper, teamsThreadLimit;
+  extractHostEvalClauses(targetOp, numThreads, numTeamsLower, numTeamsUpper,
+                         teamsThreadLimit);
+
+  // TODO: Handle constant 'if' clauses.
+  if (Value targetThreadLimit = targetOp.getThreadLimit())
+    attrs.TargetThreadLimit.front() =
+        moduleTranslation.lookupValue(targetThreadLimit);
+
+  if (numTeamsLower)
+    attrs.MinTeams = moduleTranslation.lookupValue(numTeamsLower);
+
+  if (numTeamsUpper)
+    attrs.MaxTeams.front() = moduleTranslation.lookupValue(numTeamsUpper);
+
+  if (teamsThreadLimit)
+    attrs.TeamsThreadLimit.front() =
+        moduleTranslation.lookupValue(teamsThreadLimit);
+
+  if (numThreads)
+    attrs.MaxThreads = moduleTranslation.lookupValue(numThreads);
+
+  // TODO: Populate attrs.LoopTripCount if it is target SPMD.
+}
+
 static LogicalResult
 convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
@@ -3928,7 +4149,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
 
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   bool isTargetDevice = ompBuilder->Config.isTargetDevice();
+
   auto parentFn = opInst.getParentOfType<LLVM::LLVMFuncOp>();
+  auto argIface = cast<omp::BlockArgOpenMPOpInterface>(opInst);
   auto &targetRegion = targetOp.getRegion();
   // Holds the private vars that have been mapped along with the block argument
   // that corresponds to the MapInfoOp corresponding to the private var in
@@ -3943,8 +4166,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::DenseMap<Value, Value> mappedPrivateVars;
   DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
   SmallVector<Value> mapVars = targetOp.getMapVars();
-  ArrayRef<BlockArgument> mapBlockArgs =
-      cast<omp::BlockArgOpenMPOpInterface>(opInst).getMapBlockArgs();
+  ArrayRef<BlockArgument> mapBlockArgs = argIface.getMapBlockArgs();
   llvm::Function *llvmOutlinedFn = nullptr;
 
   // TODO: It can also be false if a compile-time constant `false` IF clause is
@@ -3958,7 +4180,6 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
   // to quickly look up the corresponding map variable, if any for each
   // private variable.
   if (!targetOp.getPrivateVars().empty() && !targetOp.getMapVars().empty()) {
-    auto argIface = llvm::cast<omp::BlockArgOpenMPOpInterface>(*targetOp);
     OperandRange privateVars = targetOp.getPrivateVars();
     std::optional<ArrayAttr> privateSyms = targetOp.getPrivateSyms();
     std::optional<DenseI64ArrayAttr> privateMapIndices =
@@ -4032,7 +4253,7 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
     // Do privatization after moduleTranslation has already recorded
     // mapped values.
     MutableArrayRef<BlockArgument> privateBlockArgs =
-        cast<omp::BlockArgOpenMPOpInterface>(opInst).getPrivateBlockArgs();
+        argIface.getPrivateBlockArgs();
     SmallVector<mlir::Value> mlirPrivateVars;
     SmallVector<llvm::Value *> llvmPrivateVars;
     SmallVector<omp::PrivateClauseOp> privateDecls;
@@ -4115,14 +4336,29 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
                                         allocaIP, codeGenIP);
   };
 
-  // TODO: Populate default and runtime attributes based on the construct and
-  // clauses.
   llvm::OpenMPIRBuilder::TargetKernelRuntimeAttrs runtimeAttrs;
-  llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs = {
-      /*ExecFlags=*/llvm::omp::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1},
-      /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0};
+  llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs defaultAttrs;
+  initTargetDefaultAttrs(targetOp, defaultAttrs, isTargetDevice);
 
+  // Collect host-evaluated values needed to properly launch the kernel from the
+  // host.
+  if (!isTargetDevice)
+    initTargetRuntimeAttrs(builder, moduleTranslation, targetOp, runtimeAttrs);
+
+  // Pass host-evaluated values as parameters to the kernel / host fallback,
+  // except if they are constants. In any case, map the MLIR block argument to
+  // the corresponding LLVM values.
   llvm::SmallVector<llvm::Value *, 4> kernelInput;
+  SmallVector<Value> hostEvalVars = targetOp.getHostEvalVars();
+  ArrayRef<BlockArgument> hostEvalBlockArgs = argIface.getHostEvalBlockArgs();
+  for (auto [arg, var] : llvm::zip_equal(hostEvalBlockArgs, hostEvalVars)) {
+    llvm::Value *value = moduleTranslation.lookupValue(var);
+    moduleTranslation.mapValue(arg, value);
+
+    if (!llvm::isa<llvm::Constant>(value))
+      kernelInput.push_back(value);
+  }
+
   for (size_t i = 0; i < mapVars.size(); ++i) {
     // declare target arguments are not passed to kernels as arguments
     // TODO: We currently do not handle cases where a member is explicitly

diff --git a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir
index 2550730f3fdf26..ae5cf97c958db3 100644
--- a/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-byref-bycopy-generation-device.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
-module attributes {omp.is_target_device = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
     %0 = llvm.mlir.addressof @_QFEi : !llvm.ptr
     %1 = llvm.mlir.addressof @_QFEsp : !llvm.ptr
@@ -23,7 +23,7 @@ module attributes {omp.is_target_device = true} {
   }
 }
 
-// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}__QQmain_l{{.*}}(ptr %[[DYN_PTR:.*]], ptr %[[ARG_BYREF:.*]], ptr %[[ARG_BYCOPY:.*]]) {
+// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}__QQmain_l{{.*}}(ptr %[[DYN_PTR:.*]], ptr %[[ARG_BYREF:.*]], ptr %[[ARG_BYCOPY:.*]]) #{{[0-9]+}} {
 
 // CHECK: entry:
 // CHECK: %[[ALLOCA_BYREF:.*]] = alloca ptr, align 8

diff --git a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir
index 842d9d78a3c386..0aa592fe1bee1f 100644
--- a/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-constant-alloca-raise.mlir
@@ -10,7 +10,7 @@
 // constant sized) allocations performs its task reasonably in these 
 // scenarios. 
 
-module attributes {omp.is_target_device = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   llvm.func @_QQmain() attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
     %1 = llvm.mlir.constant(1 : i64) : i64
     %2 = llvm.alloca %1 x !llvm.struct<(ptr)> : (i64) -> !llvm.ptr
@@ -33,7 +33,7 @@ module attributes {omp.is_target_device = true} {
   llvm.func @_ExternalCall(!llvm.ptr, !llvm.ptr) -> !llvm.struct<()>
 }
 
-// CHECK:      define weak_odr protected void @{{.*}}QQmain_l{{.*}}({{.*}}, {{.*}}) {
+// CHECK:      define weak_odr protected amdgpu_kernel void @{{.*}}QQmain_l{{.*}}({{.*}}, {{.*}}) #{{[0-9]+}} {
 // CHECK-NEXT: entry:
 // CHECK-NEXT:  %[[MOVED_ALLOCA1:.*]] = alloca { ptr }, align 8
 // CHECK-NEXT:  %[[MOVED_ALLOCA2:.*]] = alloca i32, i64 1, align 4

diff --git a/mlir/test/Target/LLVMIR/omptarget-constant-indexing-device-region.mlir b/mlir/test/Target/LLVMIR/omptarget-constant-indexing-device-region.mlir
index 86fe6db3ff8192..ff046fe194c710 100644
--- a/mlir/test/Target/LLVMIR/omptarget-constant-indexing-device-region.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-constant-indexing-device-region.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
-module attributes {omp.is_target_device = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   llvm.func @_QQmain() attributes {bindc_name = "main"} {
     %0 = llvm.mlir.addressof @_QFEsp : !llvm.ptr
     %1 = llvm.mlir.constant(10 : index) : i64
@@ -30,7 +30,7 @@ module attributes {omp.is_target_device = true} {
 }
 
 
-// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}__QQmain_{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) {
+// CHECK: define {{.*}} void @__omp_offloading_{{.*}}_{{.*}}__QQmain_{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) #{{[0-9]+}} {
 
 // CHECK: %[[ARG1_ALLOCA:.*]] = alloca ptr, align 8
 // CHECK: store ptr %[[ARG1]], ptr %[[ARG1_ALLOCA]], align 8

diff --git a/mlir/test/Target/LLVMIR/omptarget-debug.mlir b/mlir/test/Target/LLVMIR/omptarget-debug.mlir
index bc930695c501de..dc2df5fdfba05b 100644
--- a/mlir/test/Target/LLVMIR/omptarget-debug.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-debug.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
-module attributes {omp.is_target_device = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   llvm.func @_QQmain() {
     %0 = llvm.mlir.constant(1 : i32) : i32
     %1 = llvm.alloca %0 x i32 : (i32) -> !llvm.ptr

diff --git a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
index b59e03bc465a2f..d37b21f20153ba 100644
--- a/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-declare-target-llvm-device.mlir
@@ -7,7 +7,7 @@
 // Unfortunately, only so much can be tested as the device side is dependent on a *.bc
 // file created by the host and appended as an attribute to the module.
 
-module attributes {omp.is_target_device = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   // CHECK-DAG: @_QMtest_0Esp_decl_tgt_ref_ptr = weak global ptr null, align 8
   llvm.mlir.global external @_QMtest_0Esp() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (link)>} : i32 {
     %0 = llvm.mlir.constant(0 : i32) : i32

diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
index 4903656c22ec72..de965f99fd4a1c 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-llvm.mlir
@@ -52,7 +52,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 }
 
 // CHECK: define weak_odr protected amdgpu_kernel void @[[FUNC0:.*]](
-// CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) {
+// CHECK-SAME: ptr %[[TMP:.*]], ptr %[[TMP0:.*]]) #{{[0-9]+}} {
 // CHECK:         %[[TMP1:.*]] = alloca [1 x ptr], align 8, addrspace(5)
 // CHECK:         %[[TMP2:.*]] = addrspacecast ptr addrspace(5) %[[TMP1]] to ptr
 // CHECK:         %[[STRUCTARG:.*]] = alloca { ptr }, align 8, addrspace(5)
@@ -96,7 +96,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
 // is passed as a param to kmpc_parallel_51 function
 
 // CHECK: define weak_odr protected amdgpu_kernel void @{{.*}}(
-// CHECK-SAME: ptr {{.*}}, ptr {{.*}}, ptr %[[IFCOND_ARG2:.*]]) {
+// CHECK-SAME: ptr {{.*}}, ptr {{.*}}, ptr %[[IFCOND_ARG2:.*]]) #{{[0-9]+}} {
 // CHECK:         store ptr %[[IFCOND_ARG2]], ptr %[[IFCOND_TMP1:.*]], align 8
 // CHECK:         %[[IFCOND_TMP2:.*]] = load i32, ptr %[[IFCOND_TMP1]], align 4
 // CHECK:         %[[IFCOND_TMP3:.*]] = icmp ne i32 %[[IFCOND_TMP2]], 0

diff --git a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
index fa32a3030108d8..5938aaeafa5efe 100644
--- a/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
-module attributes {omp.is_target_device = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   llvm.func @omp_target_region_() {
     %0 = llvm.mlir.constant(20 : i32) : i32
     %1 = llvm.mlir.constant(10 : i32) : i32
@@ -29,8 +29,8 @@ module attributes {omp.is_target_device = true} {
 // CHECK:      @[[SRC_LOC:.*]] = private unnamed_addr constant [23 x i8] c"{{[^"]*}}", align 1
 // CHECK:      @[[IDENT:.*]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[SRC_LOC]] }, align 8
 // CHECK:      @[[DYNA_ENV:.*]] = weak_odr protected global %struct.DynamicEnvironmentTy zeroinitializer
-// CHECK:      @[[KERNEL_ENV:.*]] = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 0, i32 0, i32 0, i32 -1, i32 0, i32 0 }, ptr @[[IDENT]], ptr @[[DYNA_ENV]] }
-// CHECK:      define weak_odr protected void @__omp_offloading_{{[^_]+}}_{{[^_]+}}_omp_target_region__l{{[0-9]+}}(ptr %[[DYN_PTR:.*]], ptr %[[ADDR_A:.*]], ptr %[[ADDR_B:.*]], ptr %[[ADDR_C:.*]])
+// CHECK:      @[[KERNEL_ENV:.*]] = weak_odr protected constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 1, i32 1, i32 256, i32 -1, i32 -1, i32 0, i32 0 }, ptr @[[IDENT]], ptr @[[DYNA_ENV]] }
+// CHECK:      define weak_odr protected amdgpu_kernel void @__omp_offloading_{{[^_]+}}_{{[^_]+}}_omp_target_region__l{{[0-9]+}}(ptr %[[DYN_PTR:.*]], ptr %[[ADDR_A:.*]], ptr %[[ADDR_B:.*]], ptr %[[ADDR_C:.*]])
 // CHECK:        %[[TMP_A:.*]] = alloca ptr, align 8
 // CHECK:        store ptr %[[ADDR_A]], ptr %[[TMP_A]], align 8
 // CHECK:        %[[TMP_B:.*]] = alloca ptr, align 8

diff --git a/mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir b/mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir
index be6bb6df9e45a8..d4743ea88d9d90 100644
--- a/mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
 
-module attributes {omp.is_target_device = true, omp.is_gpu = true} {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true, omp.is_gpu = true} {
   llvm.func @omp_target_region_() {
     %0 = llvm.mlir.constant(20 : i32) : i32
     %1 = llvm.mlir.constant(10 : i32) : i32
@@ -36,5 +36,5 @@ module attributes {omp.is_target_device = true, omp.is_gpu = true} {
   }
 }
 
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}_{{.*}}_omp_target_region__l19
+// CHECK: define weak_odr protected amdgpu_kernel void @__omp_offloading_{{.*}}_{{.*}}_omp_target_region__l19
 // CHECK: ret void

diff --git a/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
index 279ecb3f8e998f..b0c641c65f9fb7 100644
--- a/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-threadprivate-device-lowering.mlir
@@ -4,7 +4,7 @@
 // omp.threadprivate does not crash on lowering during the OpenMP target device
 // pass when used in conjunction with target code in the same module.
 
-module attributes {omp.is_target_device = true } {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true } {
   llvm.func @func() attributes {omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
     %0 = llvm.mlir.addressof @_QFEpointer2 : !llvm.ptr
     %1 = omp.threadprivate %0 : !llvm.ptr -> !llvm.ptr
@@ -23,7 +23,7 @@ module attributes {omp.is_target_device = true } {
   }
 }
 
-// CHECK: define weak_odr protected void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) {
+// CHECK: define weak_odr protected amdgpu_kernel void @{{.*}}(ptr %{{.*}}, ptr %[[ARG1:.*]]) #{{[0-9]+}} {
 // CHECK:  %[[ALLOCA:.*]] = alloca ptr, align 8
 // CHECK:  store ptr %[[ARG1]], ptr %[[ALLOCA]], align 8
 // CHECK:  %[[LOAD_ALLOCA:.*]] = load ptr, ptr %[[ALLOCA]], align 8

diff --git a/mlir/test/Target/LLVMIR/openmp-target-launch-device.mlir b/mlir/test/Target/LLVMIR/openmp-target-launch-device.mlir
new file mode 100644
index 00000000000000..a418445324817e
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-launch-device.mlir
@@ -0,0 +1,45 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK:      @[[EXEC_MODE1:.*]] = weak protected constant i8 1
+// CHECK:      @llvm.compiler.used{{.*}} = appending global [1 x ptr] [ptr @[[EXEC_MODE1]]], section "llvm.metadata"
+// CHECK:      @[[KERNEL1_ENV:.*_kernel_environment]] = weak_odr protected constant %struct.KernelEnvironmentTy {
+// CHECK-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE1:1]], i32 [[MIN_THREADS1:1]], i32 [[MAX_THREADS1:10]], i32 [[MIN_TEAMS1:1]], i32 [[MAX_TEAMS1:-1]], i32 0, i32 0 },
+// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}} }
+
+// CHECK:      @[[EXEC_MODE2:.*]] = weak protected constant i8 1
+// CHECK:      @llvm.compiler.used{{.*}} = appending global [1 x ptr] [ptr @[[EXEC_MODE2]]], section "llvm.metadata"
+// CHECK:      @[[KERNEL2_ENV:.*_kernel_environment]] = weak_odr protected constant %struct.KernelEnvironmentTy {
+// CHECK-SAME: %struct.ConfigurationEnvironmentTy { i8 1, i8 1, i8 [[EXEC_MODE2:1]], i32 [[MIN_THREADS2:1]], i32 [[MAX_THREADS2:30]], i32 [[MIN_TEAMS2:40]], i32 [[MAX_TEAMS2:40]], i32 0, i32 0 },
+// CHECK-SAME: ptr @{{.*}}, ptr @{{.*}} }
+
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true, omp.is_gpu = true} {
+  llvm.func @main(%num_teams : !llvm.ptr) {
+    // CHECK: define weak_odr protected amdgpu_kernel void @__omp_offloading_{{.*}}_main_l{{[0-9]+}}(ptr %[[KERNEL_ARGS:.*]], ptr %[[NUM_TEAMS_ARG:.*]]) #[[ATTRS1:[0-9]+]]
+    // CHECK: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL1_ENV]], ptr %[[KERNEL_ARGS]])
+    %target_threads = llvm.mlir.constant(20) : i32
+    %0 = omp.map.info var_ptr(%num_teams : !llvm.ptr, i32) map_clauses(to) capture(ByCopy) -> !llvm.ptr
+    omp.target thread_limit(%target_threads : i32) map_entries(%0 -> %arg_teams : !llvm.ptr) {
+      %teams_threads = llvm.mlir.constant(10) : i32
+      %num_teams1 = llvm.load %arg_teams : !llvm.ptr -> i32
+      omp.teams num_teams(to %num_teams1 : i32) thread_limit(%teams_threads : i32) {
+        omp.terminator
+      }
+      omp.terminator
+    }
+
+    // CHECK: define weak_odr protected amdgpu_kernel void @__omp_offloading_{{.*}}_main_l{{[0-9]+}}(ptr %[[KERNEL_ARGS:.*]]) #[[ATTRS2:[0-9]+]]
+    // CHECK: %{{.*}} = call i32 @__kmpc_target_init(ptr @[[KERNEL2_ENV]], ptr %[[KERNEL_ARGS]])
+    %target_threads2 = llvm.mlir.constant(30) : i32
+    omp.target thread_limit(%target_threads2 : i32) {
+      %num_teams2 = llvm.mlir.constant(40) : i32
+      omp.teams num_teams(to %num_teams2 : i32) {
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}
+
+// CHECK: attributes #[[ATTRS1]] = { "amdgpu-flat-work-group-size"="[[MIN_THREADS1]],[[MAX_THREADS1]]" "omp_target_thread_limit"="[[MAX_THREADS1]]" }
+// CHECK: attributes #[[ATTRS2]] = { "amdgpu-flat-work-group-size"="[[MIN_THREADS2]],[[MAX_THREADS2]]" "amdgpu-max-num-workgroups"="[[MIN_TEAMS2]],1,1" "omp_target_num_teams"="[[MIN_TEAMS2]]" "omp_target_thread_limit"="[[MAX_THREADS2]]" }

diff --git a/mlir/test/Target/LLVMIR/openmp-target-launch-host.mlir b/mlir/test/Target/LLVMIR/openmp-target-launch-host.mlir
new file mode 100644
index 00000000000000..abc67017b620d6
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-target-launch-host.mlir
@@ -0,0 +1,31 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// CHECK: define void @main(i32 %[[NUM_TEAMS_ARG:.*]])
+// CHECK: %[[KERNEL_ARGS:.*]] = alloca %struct.__tgt_kernel_arguments
+// CHECK: %[[NUM_TEAMS:.*]] = insertvalue [3 x i32] zeroinitializer, i32 %[[NUM_TEAMS_ARG]], 0
+
+// CHECK: %[[NUM_TEAMS_KARG:.*]] = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %[[KERNEL_ARGS]], i32 0, i32 10
+// CHECK: store [3 x i32] %[[NUM_TEAMS]], ptr %[[NUM_TEAMS_KARG]], align 4
+
+// CHECK: %[[NUM_THREADS_ARG:.*]] = getelementptr inbounds nuw %struct.__tgt_kernel_arguments, ptr %[[KERNEL_ARGS]], i32 0, i32 11
+// CHECK: store [3 x i32] [i32 10, i32 0, i32 0], ptr %[[NUM_THREADS_ARG]], align 4
+
+// CHECK: %{{.*}} = call i32 @__tgt_target_kernel(ptr {{.*}}, i64 -1, i32 %[[NUM_TEAMS_ARG]], i32 [[NUM_THREADS:10]], ptr @.[[OUTLINED_FN:.*]].region_id, ptr %[[KERNEL_ARGS]])
+// CHECK: call void @[[OUTLINED_FN]](i32 %[[NUM_TEAMS_ARG]])
+
+// CHECK: define internal void @[[OUTLINED_FN]](i32 %[[NUM_TEAMS_OUTLINED:.*]])
+// CHECK: call void @__kmpc_push_num_teams_51(ptr {{.*}}, i32 {{.*}}, i32 %[[NUM_TEAMS_OUTLINED]], i32 %[[NUM_TEAMS_OUTLINED]], i32 [[NUM_THREADS]])
+module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  llvm.func @main(%num_teams : i32) {
+    %target_threads = llvm.mlir.constant(20) : i32
+    %teams_threads = llvm.mlir.constant(10) : i32
+    omp.target thread_limit(%target_threads : i32)
+               host_eval(%num_teams -> %arg_teams, %teams_threads -> %arg_teams_threads : i32, i32) {
+      omp.teams num_teams(to %arg_teams : i32) thread_limit(%arg_teams_threads : i32) {
+        omp.terminator
+      }
+      omp.terminator
+    }
+    llvm.return
+  }
+}

diff --git a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir
index a94bbdce891f9a..3872d908e7a201 100644
--- a/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-target-use-device-nested.mlir
@@ -3,7 +3,7 @@
 // This tests check that target code nested inside a target data region which
 // has only use_device_ptr mapping corectly generates code on the device pass.
 
-// CHECK:         define weak_odr protected void @__omp_offloading{{.*}}main_
+// CHECK:         define weak_odr protected amdgpu_kernel void @__omp_offloading{{.*}}main_
 // CHECK-NEXT:       entry:
 // CHECK-NEXT:         %[[VAL_3:.*]] = alloca ptr, align 8
 // CHECK-NEXT:         store ptr %[[VAL_4:.*]], ptr %[[VAL_3]], align 8
@@ -21,7 +21,7 @@
 // CHECK-NEXT:         %[[VAL_13:.*]] = load ptr, ptr %[[VAL_11]], align 8
 // CHECK-NEXT:         store i32 999, ptr %[[VAL_13]], align 4
 // CHECK-NEXT:         br label %[[VAL_14:.*]]
-module attributes {omp.is_target_device = true } {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true } {
   llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
     %0 = llvm.mlir.constant(1 : i64) : i64
     %a = llvm.alloca %0 x !llvm.ptr : (i64) -> !llvm.ptr

diff --git a/mlir/test/Target/LLVMIR/openmp-task-target-device.mlir b/mlir/test/Target/LLVMIR/openmp-task-target-device.mlir
index 4d6b36e5f54509..f09257d0913245 100644
--- a/mlir/test/Target/LLVMIR/openmp-task-target-device.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-task-target-device.mlir
@@ -3,7 +3,7 @@
 // This tests the fix for https://github.com/llvm/llvm-project/issues/84606
 // We are only interested in ensuring that the -mlir-to-llmvir pass doesn't crash.
 // CHECK: {{.*}} = add i32 {{.*}}, 5
-module attributes {omp.is_target_device = true } {
+module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_target_device = true} {
   llvm.func @_QQmain() attributes {fir.bindc_name = "main", omp.declare_target = #omp.declaretarget<device_type = (host), capture_clause = (to)>} {
     %0 = llvm.mlir.constant(0 : i32) : i32
     %1 = llvm.mlir.constant(1 : i64) : i64

diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index ffb865edff80dd..392a6558dcfa69 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -248,10 +248,20 @@ llvm.func @target_has_device_addr(%x : !llvm.ptr) {
 // -----
 
 llvm.func @target_host_eval(%x : i32) {
-  // expected-error@below {{not yet implemented: Unhandled clause host_eval in omp.target operation}}
+  // expected-error@below {{not yet implemented: host evaluation of loop bounds in omp.target operation}}
   // expected-error@below {{LLVM Translation failed for operation: omp.target}}
-  omp.target host_eval(%x -> %arg0 : i32) {
-    omp.teams num_teams(to %arg0 : i32) {
+  omp.target host_eval(%x -> %lb, %x -> %ub, %x -> %step : i32, i32, i32) {
+    omp.teams {
+      omp.parallel {
+        omp.distribute {
+          omp.wsloop {
+            omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
+              omp.yield
+            }
+          } {omp.composite}
+        } {omp.composite}
+        omp.terminator
+      } {omp.composite}
       omp.terminator
     }
     omp.terminator
@@ -329,17 +339,6 @@ llvm.func @target_firstprivate(%x : !llvm.ptr) {
 
 // -----
 
-llvm.func @target_thread_limit(%x : i32) {
-  // expected-error@below {{not yet implemented: Unhandled clause thread_limit in omp.target operation}}
-  // expected-error@below {{LLVM Translation failed for operation: omp.target}}
-  omp.target thread_limit(%x : i32) {
-    omp.terminator
-  }
-  llvm.return
-}
-
-// -----
-
 llvm.func @target_enter_data_depend(%x: !llvm.ptr) {
   // expected-error@below {{not yet implemented: Unhandled clause depend in omp.target_enter_data operation}}
   // expected-error@below {{LLVM Translation failed for operation: omp.target_enter_data}}


        

