[Openmp-commits] [clang] [llvm] [openmp] [OpenMP] support for Emissary APIs as discussed in 89169 (PR #175265)

Greg Rodgers via Openmp-commits openmp-commits at lists.llvm.org
Fri Jan 9 15:11:11 PST 2026


https://github.com/gregrodgers created https://github.com/llvm/llvm-project/pull/175265

This is the initial support for Emissary APIs as discussed here:

https://discourse.llvm.org/t/emissary-apis-a-general-purpose-framework-for-gpu-initiated-host-execution-of-native-host-apis/89169

Currently, the functions printf, fprintf, MPI_Send, MPI_Recv, and much of the Fortran IO Runtime is working.   The later provides the ability to have print and write FORTRAN statements in your target region.  

There is already printf and fprintf support in the device libc which uses the same offload RPC infrastructure that emissary uses.  To disable emissary printf and fprintf set -fno-use-emissary-print.  use-emissary-print is the default because it is much faster.   

>From cdcedd36b2a590e443974941bd3d4873abe21796 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Fri, 9 Jan 2026 17:01:48 -0600
Subject: [PATCH] [OpenMP] support for Emissary APIs as discussed in 89169

---
 clang/include/clang/Basic/LangOptions.def     |   1 +
 clang/include/clang/Options/Options.td        |   6 +
 clang/lib/CodeGen/CGEmitEmissaryExec.cpp      | 371 ++++++++++++++
 clang/lib/CodeGen/CGExpr.cpp                  |  11 +
 clang/lib/CodeGen/CMakeLists.txt              |   1 +
 clang/lib/CodeGen/CodeGenFunction.h           |   1 +
 clang/lib/Driver/ToolChains/Clang.cpp         |   7 +
 clang/lib/Headers/CMakeLists.txt              |   2 +
 clang/lib/Headers/EmissaryIds.h               |  97 ++++
 clang/lib/Headers/EmissaryMPI.h               | 202 ++++++++
 clang/lib/Headers/llvm_libc_wrappers/stdio.h  |  17 +
 offload/include/Shared/RPCOpcodes.h           |   2 +
 offload/libomptarget/CMakeLists.txt           |   1 +
 offload/plugins-nextgen/common/CMakeLists.txt |  15 +
 .../plugins-nextgen/common/include/Emissary.h | 263 ++++++++++
 .../plugins-nextgen/common/src/Emissary.cpp   | 259 ++++++++++
 .../common/src/EmissaryFortrt.cpp             | 470 ++++++++++++++++++
 .../common/src/EmissaryPrint.cpp              | 423 ++++++++++++++++
 offload/plugins-nextgen/common/src/RPC.cpp    | 152 ++++++
 openmp/device/CMakeLists.txt                  |  10 +
 openmp/device/src/EmissaryFortrt.cpp          | 144 ++++++
 openmp/device/src/EmissaryPrint.cpp           |  79 +++
 openmp/device/src/Misc.cpp                    |  62 +++
 23 files changed, 2596 insertions(+)
 create mode 100644 clang/lib/CodeGen/CGEmitEmissaryExec.cpp
 create mode 100644 clang/lib/Headers/EmissaryIds.h
 create mode 100644 clang/lib/Headers/EmissaryMPI.h
 create mode 100644 offload/plugins-nextgen/common/include/Emissary.h
 create mode 100644 offload/plugins-nextgen/common/src/Emissary.cpp
 create mode 100644 offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
 create mode 100644 offload/plugins-nextgen/common/src/EmissaryPrint.cpp
 create mode 100644 openmp/device/src/EmissaryFortrt.cpp
 create mode 100644 openmp/device/src/EmissaryPrint.cpp

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8cba1dbaee24e..69ff43cef7307 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -227,6 +227,7 @@ LANGOPT(OpenMPExtensions  , 1, 1, NotCompatible, "Enable all Clang extensions fo
 LANGOPT(OpenMPSimd        , 1, 0, NotCompatible, "Use SIMD only OpenMP support.")
 LANGOPT(OpenMPUseTLS      , 1, 0, NotCompatible, "Use TLS for threadprivates or runtime calls")
 LANGOPT(OpenMPIsTargetDevice    , 1, 0, NotCompatible, "Generate code only for OpenMP target device")
+LANGOPT(UseEmissaryPrint  , 1, 0, NotCompatible, "Enables use of certain IO functions with Emissary rather than LIBC")
 LANGOPT(OpenMPCUDAMode    , 1, 0, NotCompatible, "Generate code for OpenMP pragmas in SIMT/SPMD mode")
 LANGOPT(OpenMPIRBuilder   , 1, 0, NotCompatible, "Use the experimental OpenMP-IR-Builder codegen path.")
 LANGOPT(OpenMPCUDANumSMs  , 32, 0, NotCompatible, "Number of SMs for CUDA devices.")
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 6a72931727a7c..8cf017d8effd7 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -8940,6 +8940,12 @@ def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
 
 } // let Visibility = [CC1Option, FC1Option]
 
+defm use_emissary_print: BoolFOption<"use-emissary-print",
+  LangOpts<"UseEmissaryPrint">, DefaultTrue,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+  "Enable use of Emissary printf/fprint overriding device libc printf/fprintf">,
+  NegFlag<SetFalse>>;
+
 //===----------------------------------------------------------------------===//
 // Coarray Options
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
new file mode 100644
index 0000000000000..cc574d1ffdcf2
--- /dev/null
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -0,0 +1,371 @@
+//===------- CGEmitEmissaryExec.cpp - Codegen for _emissary_exec --==------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmitEmissaryExec:
+//
+// When a device call to the varadic function _emissary_exec is encountered
+// (in CGExpr.cpp) EmitEmissaryExec does these steps:
+//
+// 1. If string lens are runtime dependent, Emit code to determine runtime len.
+// 2. Emits call to allocate memory __llvm_emissary_premalloc,
+// 3. Emit stores of each arg into arg buffer,
+// 4. Emits call to function __llvm_emissary_rpc or __llvm_emissary_rpc_dm
+//
+// The arg buffer is a struct that contains the length, number of args, an
+// array of 4-byte keys that represent the type of of each arg, an array of
+// aligned "data" values for each arg, and finally the runtime string values.
+// If an arg is a string the data value is the runtime length of the string.
+// Each 4-byte key contains the llvm type ID and the number of bits for the
+// type. encoded by the macro _PACK_TY_BITLEN(x,y) ((uint32_t)x << 16) |
+// ((uint32_t)y)
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../clang/lib/Headers/EmissaryIds.h"
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+// These static helper functions support EmitEmissaryExec.
+static llvm::Function *GetOmpStrlenDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  // Args are pointer to char and maxstringlen
+  llvm::Type *ArgTypes[] = {CGM.Int8PtrTy, CGM.Int32Ty};
+  llvm::FunctionType *OmpStrlenFTy =
+      llvm::FunctionType::get(CGM.Int32Ty, ArgTypes, false);
+  if (auto *F = M.getFunction("__strlen_max")) {
+    assert(F->getFunctionType() == OmpStrlenFTy);
+    return F;
+  }
+  llvm::Function *FN = llvm::Function::Create(
+      OmpStrlenFTy, llvm::GlobalVariable::ExternalLinkage, "__strlen_max", &M);
+  return FN;
+}
+
+// Deterimines if an expression is a string with variable lenth
+static bool isVarString(const clang::Expr *argX, const clang::Type *argXTy,
+                        const llvm::Value *Arg) {
+  if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+      argXTy->getPointeeOrArrayElementType()->isCharType() && !argX->isLValue())
+    return true;
+  // Ensure the VarDecl has an inititalizer
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(argX))
+    if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+      if (!VD->getInit() ||
+          !llvm::isa<StringLiteral>(VD->getInit()->IgnoreImplicit()))
+        return true;
+  return false;
+}
+
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+  if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+      argXTy->getPointeeOrArrayElementType()->isCharType())
+    return true;
+  else
+    return false;
+}
+
+// Gets a string literal to write into the transfer buffer
+static const StringLiteral *getSL(const clang::Expr *argX,
+                                  const clang::Type *argXTy) {
+  // String in argX has known constant length
+  if (!argXTy->isConstantArrayType()) {
+    // Allow constant string to be a declared variable,
+    // But it must be constant and initialized.
+    const DeclRefExpr *DRE = cast<DeclRefExpr>(argX);
+    const VarDecl *VarD = cast<VarDecl>(DRE->getDecl());
+    argX = VarD->getInit()->IgnoreImplicit();
+  }
+  const StringLiteral *SL = cast<StringLiteral>(argX);
+  return SL;
+}
+
+// Returns a function pointer to __llvm_emissary_premalloc
+static llvm::Function *GetEmissaryAllocDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  // clang::CodeGen::CodeGenTypes &CGT = CGM.getTypes();
+  const char *_executeName = "__llvm_emissary_premalloc";
+  llvm::Type *ArgTypes[] = {CGM.Int32Ty};
+  llvm::Function *FN;
+  // Maybe this should be pointer to char instead of pointer to void
+  llvm::FunctionType *VargsFnAllocFuncType = llvm::FunctionType::get(
+      CGM.getTypes().ConvertType(
+          CGM.getContext().getPointerType(CGM.getContext().VoidTy)),
+      ArgTypes, false);
+  if (!(FN = M.getFunction(_executeName)))
+    FN = llvm::Function::Create(VargsFnAllocFuncType,
+                                llvm::GlobalVariable::ExternalLinkage,
+                                _executeName, &M);
+  assert(FN->getFunctionType() == VargsFnAllocFuncType);
+  return FN;
+}
+
+// Returns a function pointer to __llvm_emissary_rpc
+static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
+                                                  bool hasXfers) {
+  const char *_executeName =
+      hasXfers ? "__llvm_emissary_rpc_dm" : "__llvm_emissary_rpc";
+  auto &M = CGM.getModule();
+  llvm::Type *ArgTypes[] = {
+      CGM.Int32Ty, CGM.getTypes().ConvertType(CGM.getContext().getPointerType(
+                       CGM.getContext().VoidTy))};
+  llvm::Function *FN;
+  llvm::FunctionType *VarfnFuncType =
+      llvm::FunctionType::get(CGM.Int64Ty, ArgTypes, false);
+  if (!(FN = M.getFunction(_executeName)))
+    FN = llvm::Function::Create(
+        VarfnFuncType, llvm::GlobalVariable::ExternalLinkage, _executeName, &M);
+  assert(FN->getFunctionType() == VarfnFuncType);
+  return FN;
+}
+
+// A macro to pack the llvm type ID and numbits into 4-byte key
+#define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
+
+//  ----- External function EmitEmissaryExec called from CGExpr.cpp -----
+RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
+  assert(getTarget().getTriple().isAMDGCN() ||
+         getTarget().getTriple().isNVPTX());
+  assert(E->getNumArgs() >= 1); // _emissary_exec always has at least one arg.
+  const llvm::DataLayout &DL = CGM.getDataLayout();
+  CallArgList Args;
+
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // We don't know how to emit non-scalar varargs.
+  if (std::any_of(Args.begin() + 1, Args.end(), [&](const CallArg &A) {
+        return !A.getRValue(*this).isScalar();
+      })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg in GPU vargs function");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+  }
+  // NumArgs always includes emisid, but E->getNumArgs() could be 1 less if
+  // inserted it above.
+  unsigned NumArgs = (unsigned)Args.size();
+  llvm::SmallVector<llvm::Type *, 32> ArgTypes;
+  llvm::SmallVector<llvm::Value *, 32> VarStrLengths;
+  llvm::Value *TotalVarStrsLength = llvm::ConstantInt::get(Int32Ty, 0);
+  bool hasVarStrings = false;
+  ArgTypes.push_back(Int32Ty); // 1st field in struct is total DataLen
+  ArgTypes.push_back(Int32Ty); // 2nd field in struct will be num args
+  // An array of 4-byte keys that describe the arg type
+  for (unsigned I = 0; I < NumArgs; ++I)
+    ArgTypes.push_back(Int32Ty);
+
+  // Track the size of the numeric data length and string length
+  unsigned DataLen_CT =
+      (unsigned)(DL.getTypeAllocSize(Int32Ty)) * (NumArgs + 2);
+  unsigned AllStringsLen_CT = 0;
+
+  // ---  1st Pass over Args to create ArgTypes and count size ---
+  size_t structOffset = 4 * (NumArgs + 2);
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
+    llvm::Type *ArgType = Arg->getType();
+    // Skip string processing on arg0 which may not be in E->getArg(0)
+    if (I != 0) {
+      const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+      auto *argXTy = argX->getType().getTypePtr();
+      if (isString(argXTy)) {
+        if (isVarString(argX, argXTy, Arg)) {
+          hasVarStrings = true;
+          if (auto *PtrTy = dyn_cast<llvm::PointerType>(ArgType))
+            if (PtrTy->getPointerAddressSpace()) {
+              Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
+              ArgType = Arg->getType();
+            }
+          llvm::Value *VarStrLen =
+              Builder.CreateCall(GetOmpStrlenDeclaration(CGM),
+                                 {Arg, llvm::ConstantInt::get(Int32Ty, 1024)});
+          VarStrLengths.push_back(VarStrLen);
+          TotalVarStrsLength = Builder.CreateAdd(TotalVarStrsLength, VarStrLen,
+                                                 "sum_of_var_strings_length");
+          ArgType = Int32Ty;
+        } else {
+          const StringLiteral *SL = getSL(argX, argXTy);
+          StringRef ArgString = SL->getString();
+          AllStringsLen_CT += ((int)ArgString.size() + 1);
+          // change ArgType from char ptr to int to contain string length
+          ArgType = Int32Ty;
+        }
+      } // end of processing string argument
+    } // End of skip 1st arg
+    // if ArgTypeSize is >4 bytes we need to insert dummy align
+    // values in the struct so all stores can be aligned .
+    // These dummy fields must be inserted before the arg.
+    //
+    // In the pass below where the stores are generated careful
+    // tracking of the index into the struct is necessary.
+    size_t needsPadding = (structOffset % (size_t)DL.getTypeAllocSize(ArgType));
+    if (needsPadding) {
+      DataLen_CT += (unsigned)needsPadding;
+      structOffset += needsPadding;
+      ArgTypes.push_back(Int32Ty); // could assert that needsPadding == 4 here
+    }
+
+    ArgTypes.push_back(ArgType);
+    DataLen_CT += ((int)DL.getTypeAllocSize(ArgType));
+    structOffset += (size_t)DL.getTypeAllocSize(ArgType);
+  }
+
+  // ---  Generate call to __llvm_emissary_premalloc to get data pointer
+  if (hasVarStrings)
+    TotalVarStrsLength = Builder.CreateAdd(
+        TotalVarStrsLength,
+        llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT),
+        "total_buffer_size");
+  llvm::Value *BufferLen =
+      hasVarStrings
+          ? TotalVarStrsLength
+          : llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT);
+  llvm::Value *DataStructPtr =
+      Builder.CreateCall(GetEmissaryAllocDeclaration(CGM), {BufferLen});
+
+  // --- Cast the generic return pointer to be a struct in device global memory
+  llvm::StructType *DataStructTy =
+      llvm::StructType::create(ArgTypes, "varfn_args_store");
+  unsigned AS = getContext().getTargetAddressSpace(LangAS::cuda_device);
+  llvm::Value *BufferPtr = Builder.CreatePointerCast(
+      DataStructPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+      "varfn_args_store_casted");
+  // ---  Header of struct contains length and NumArgs ---
+  llvm::Value *DataLenField = llvm::ConstantInt::get(Int32Ty, DataLen_CT);
+  llvm::Value *P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 0);
+  Builder.CreateAlignedStore(DataLenField, P,
+                             DL.getPrefTypeAlign(DataLenField->getType()));
+  llvm::Value *NumArgsField = llvm::ConstantInt::get(Int32Ty, NumArgs);
+  P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 1);
+  Builder.CreateAlignedStore(NumArgsField, P,
+                             DL.getPrefTypeAlign(NumArgsField->getType()));
+
+  // ---  2nd Pass: create array of 4-byte keys to describe each arg
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
+    llvm::Type::TypeID argtypeid =
+        Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+
+    // Get type size in bits. Usually 64 or 32.
+    uint32_t numbits = 0;
+    if (I > 0 &&
+        isString(E->getArg(I)->IgnoreParenCasts()->getType().getTypePtr()))
+      // The llvm typeID for string is pointer.  Since pointer numbits is 0,
+      // we set numbits to 1 to distinguish pointer type ID as string pointer.
+      numbits = 1;
+    else
+      numbits = ty->getScalarSizeInBits();
+    // Create a key that combines llvm typeID and size
+    llvm::Value *Key =
+        llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+    P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
+    Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
+  }
+
+  // ---  3rd Pass: Store data values for each arg ---
+  unsigned varstring_index = 0;
+  unsigned structIndex = 2 + NumArgs;
+  structOffset = 4 * structIndex;
+  bool hasXfers;
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Value *Arg;
+    if (I == 0) {
+      Arg = Args[I].getKnownRValue().getScalarVal();
+      llvm::ConstantInt *CI = llvm::dyn_cast<llvm::ConstantInt>(Arg);
+      uint64_t uint64value = CI->getZExtValue();
+      uint32_t lower_32 = (uint32_t)(uint64value & 0xFFFFFFFF);
+      hasXfers = lower_32 ? true : false;
+    } else {
+      const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+      auto *argXTy = argX->getType().getTypePtr();
+      if (isString(argXTy)) {
+        if (isVarString(argX, argXTy, Arg)) {
+          Arg = VarStrLengths[varstring_index];
+          varstring_index++;
+        } else {
+          const StringLiteral *SL = getSL(argX, argXTy);
+          StringRef ArgString = SL->getString();
+          int ArgStrLen = (int)ArgString.size() + 1;
+          // Change Arg from a char pointer to the integer string length
+          Arg = llvm::ConstantInt::get(Int32Ty, ArgStrLen);
+        }
+      } else {
+        Arg = Args[I].getKnownRValue().getScalarVal();
+      }
+    }
+    size_t structElementSize = (size_t)DL.getTypeAllocSize(Arg->getType());
+    size_t needsPadding = (structOffset % structElementSize);
+    if (needsPadding) {
+      // Skip over dummy fields in struct to align
+      structOffset += needsPadding; // should assert needsPadding == 4
+      structIndex++;
+    }
+    P = Builder.CreateStructGEP(DataStructTy, BufferPtr, structIndex);
+    Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
+    structOffset += structElementSize;
+    structIndex++;
+  }
+
+  // ---  4th Pass: memcpy all strings after the data values ---
+  // bitcast the struct in device global memory as a char buffer
+  Address BufferPtrByteAddr =
+      Address(Builder.CreatePointerCast(
+                  BufferPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+                  "_casted"),
+              Int8Ty, CharUnits::fromQuantity(1));
+
+  // BufferPtrByteAddr is a pointer to where we want to write the next string
+  BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+      BufferPtrByteAddr, CharUnits::fromQuantity(DataLen_CT));
+  varstring_index = 0;
+  // Skip string processing on arg0 which may not be in E->getArg(0)
+  for (unsigned I = 1; I < NumArgs; ++I) {
+    llvm::Value *Arg = Args[I].getKnownRValue().getScalarVal();
+    const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+    auto *argXTy = argX->getType().getTypePtr();
+    if (isString(argXTy)) {
+      if (isVarString(argX, argXTy, Arg)) {
+        llvm::Value *varStrLength = VarStrLengths[varstring_index];
+        varstring_index++;
+        Address SrcAddr = Address(Arg, Int8Ty, CharUnits::fromQuantity(1));
+        Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, varStrLength);
+        // update BufferPtrByteAddr for next string memcpy
+        llvm::Value *PtrAsInt = BufferPtrByteAddr.emitRawPointer(*this);
+        BufferPtrByteAddr =
+            Address(Builder.CreateGEP(Int8Ty, PtrAsInt,
+                                      ArrayRef<llvm::Value *>(varStrLength)),
+                    Int8Ty, CharUnits::fromQuantity(1));
+      } else {
+        const StringLiteral *SL = getSL(argX, argXTy);
+        StringRef ArgString = SL->getString();
+        int ArgStrLen = (int)ArgString.size() + 1;
+        Address SrcAddr = CGM.GetAddrOfConstantStringFromLiteral(SL);
+        Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, ArgStrLen);
+        // update BufferPtrByteAddr for next memcpy
+        BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+            BufferPtrByteAddr, CharUnits::fromQuantity(ArgStrLen));
+      }
+    }
+  }
+  // --- Generate call to __llvm_emissary_rpc and return RValue
+  llvm::Value *emis_rc = Builder.CreateCall(
+      GetEmissaryExecDeclaration(CGM, hasXfers), {BufferLen, DataStructPtr});
+  // truncate long long int to int for printf return value.
+  if ((E->getDirectCallee()->getNameAsString() == "fprintf") ||
+      (E->getDirectCallee()->getNameAsString() == "printf"))
+    emis_rc = Builder.CreateTrunc(emis_rc, CGM.Int32Ty, "emis_rc");
+  return RValue::get(emis_rc);
+}
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 6309c37788f0c..a4ee21d968b5e 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6829,6 +6829,17 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
       StaticOperator = true;
   }
 
+  // Replace calls to _emissary_exec found in emissary device stubs with calls
+  // to either __llvm_emissary_rpc or __llvm_emissary_rpc_dm. Before the call
+  // EmitEmissaryExec generates code to allocate an arg buffer and to fill the
+  // arg buffer.
+  if ((CGM.getTriple().isAMDGCN() || CGM.getTriple().isNVPTX()) && FnType &&
+      dyn_cast<FunctionProtoType>(FnType) &&
+      dyn_cast<FunctionProtoType>(FnType)->isVariadic() &&
+      (E->getDirectCallee()->getNameAsString() == "_emissary_exec") &&
+      CGM.getLangOpts().OpenMP)
+    return EmitEmissaryExec(E);
+
   auto Arguments = E->arguments();
   if (StaticOperator) {
     // If we're calling a static operator, we need to emit the object argument
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index dbbc35b372f42..f699cee7fea11 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangCodeGen
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
+  CGEmitEmissaryExec.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 855e43631f436..3e962e5d0e34c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4708,6 +4708,7 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
   RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
+  RValue EmitEmissaryExec(const CallExpr *E);
 
   RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                          const CallExpr *E, ReturnValueSlot ReturnValue);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4399eb475be75..b3be55c9d7d3d 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7813,6 +7813,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
+  // To opt-out of emissary printf/fprint set -fno-use-emissary-print. This
+  // will use the slower device libc for printf/fprintf. In the default mode
+  // (use-emissary-print), all other device libc functions are still active.
+  if (Args.hasFlag(options::OPT_fuse_emissary_print,
+                   options::OPT_fno_use_emissary_print, true))
+    CmdArgs.push_back("-DOFFLOAD_ENABLE_EMISSARY_PRINT");
+
   if (Triple.isAMDGPU()) {
     handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c92b370b88d2d..c1edefa5f6aaa 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -41,6 +41,8 @@ set(core_files
   tgmath.h
   unwind.h
   varargs.h
+  EmissaryIds.h
+  EmissaryMPI.h
   )
 
 set(arm_common_files
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
new file mode 100644
index 0000000000000..f9ff616a0aacb
--- /dev/null
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -0,0 +1,97 @@
+//===- openmp/device/include/EmissaryIds.h enum & headers ----- C++ -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines Emissary API identifiers. This header is used by both host
+// and device compilations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_IDS_H
+#define OFFLOAD_EMISSARY_IDS_H
+/// The sets of emissary APIs under development
+typedef enum {
+  EMIS_ID_INVALID,
+  EMIS_ID_FORTRT,
+  EMIS_ID_PRINT,
+  EMIS_ID_MPI,
+  EMIS_ID_HDF5,
+  EMIS_ID_RESERVE,
+} offload_emis_id_t;
+
+typedef enum {
+  _print_INVALID,
+  _printf_idx,
+  _fprintf_idx,
+  _ockl_asan_report_idx,
+} offload_emis_print_t;
+
+/// The vargs function used by emissary API device stubs
+unsigned long long _emissary_exec(unsigned long long, ...);
+
+//#define _PACK_EMIS_IDS(x, y)                                                   \
+//  ((unsigned long long)x << 32) | ((unsigned long long)y)
+
+#define _PACK_EMIS_IDS(a, b, c, d)                                             \
+  ((unsigned long long)a << 48) | ((unsigned long long)b << 32) |              \
+      ((unsigned long long)c << 16) | ((unsigned long long)d)
+
+typedef enum {
+  _FortranAio_INVALID,
+  _FortranAioBeginExternalListOutput_idx,
+  _FortranAioOutputAscii_idx,
+  _FortranAioOutputInteger32_idx,
+  _FortranAioEndIoStatement_idx,
+  _FortranAioOutputInteger8_idx,
+  _FortranAioOutputInteger16_idx,
+  _FortranAioOutputInteger64_idx,
+  _FortranAioOutputReal32_idx,
+  _FortranAioOutputReal64_idx,
+  _FortranAioOutputComplex32_idx,
+  _FortranAioOutputComplex64_idx,
+  _FortranAioOutputLogical_idx,
+  _FortranAAbort_idx,
+  _FortranAStopStatementText_idx,
+  _FortranAioBeginExternalFormattedOutput_idx,
+  _FortranAStopStatement_idx,
+} offload_emis_fortrt_idx;
+
+/// This structure is created by emisExtractArgBuf to get information
+/// from the data buffer passed by rpc.
+typedef struct {
+  unsigned int DataLen;
+  unsigned int NumArgs;
+  unsigned int emisid;
+  unsigned int emisfnid;
+  unsigned int NumSendXfers;
+  unsigned int NumRecvXfers;
+  unsigned long long data_not_used;
+  char *keyptr;
+  char *argptr;
+  char *strptr;
+} emisArgBuf_t;
+
+typedef unsigned long long EmissaryReturn_t;
+typedef unsigned long long emis_argptr_t;
+typedef EmissaryReturn_t emisfn_t(void *, ...);
+
+#define MAXVARGS 32
+
+typedef enum service_rc {
+  _ERC_SUCCESS = 0,
+  _ERC_STATUS_ERROR = 1,
+  _ERC_DATA_USED_ERROR = 2,
+  _ERC_ADDINT_ERROR = 3,
+  _ERC_ADDFLOAT_ERROR = 4,
+  _ERC_ADDSTRING_ERROR = 5,
+  _ERC_UNSUPPORTED_ID_ERROR = 6,
+  _ERC_INVALID_ID_ERROR = 7,
+  _ERC_ERROR_INVALID_REQUEST = 8,
+  _ERC_EXCEED_MAXVARGS_ERROR = 9,
+} service_rc;
+
+#endif // OFFLOAD_EMISSARY_IDS_H
diff --git a/clang/lib/Headers/EmissaryMPI.h b/clang/lib/Headers/EmissaryMPI.h
new file mode 100644
index 0000000000000..c9529e1ea73ce
--- /dev/null
+++ b/clang/lib/Headers/EmissaryMPI.h
@@ -0,0 +1,202 @@
+//===----------------  openmp/device/include/EmissaryMPI.h  ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmissaryMPI.h This include must be included by MPI application
+//
+//===----------------------------------------------------------------------===//
+#include "EmissaryIds.h"
+#include <stdarg.h>
+#include <unordered_map>
+
+typedef enum {
+  _MPI_INVALID,
+  _MPI_Send_idx,
+  _MPI_Recv_idx,
+  _MPI_Allreduce_idx,
+  _MPI_Reduce_idx,
+} offload_emis_mpi_t;
+
+// -------- DELETE THIS BLOCK WHEN MPI_Type_size on GPU WORKS ----------------
+// Emissary_Initialize_MPI builds a table of lengths for each each MPI_Datatype.
+// That table is called _mpi_type_lens and passed to the device.
+// We need this because we do not yet have a GPU version of MPI_Type_size.
+// If we did we can avoid the table search for datatype_size. This is
+// how MPI datatype length should be calculated.
+//   int datatype_size;
+//   MPI_Type_size(v2,&datatype_size) ;
+// Delete this table search when we have a working MPI_Type_size.
+#define _MPI_DATATYPES 5
+typedef struct mpi_type_len_t {
+  uint64_t dt_signature;
+  uint32_t dt_size;
+} mpi_type_len_t;
+#pragma omp begin declare target
+mpi_type_len_t _mpi_type_lens[_MPI_DATATYPES];
+#pragma omp end declare target
+void Emissary_Initialize_MPI() {
+  MPI_Datatype _mpi_int = MPI_INT;
+  MPI_Datatype _mpi_float = MPI_FLOAT;
+  MPI_Datatype _mpi_unsigned = MPI_UNSIGNED;
+  MPI_Datatype _mpi_double = MPI_DOUBLE;
+  MPI_Datatype _mpi_char = MPI_CHAR;
+  _mpi_type_lens[0] = {(uint64_t)_mpi_int, 4};
+  _mpi_type_lens[1] = {(uint64_t)_mpi_unsigned, 4};
+  _mpi_type_lens[2] = {(uint64_t)_mpi_float, 4};
+  _mpi_type_lens[3] = {(uint64_t)_mpi_double, 8};
+  _mpi_type_lens[4] = {(uint64_t)_mpi_char, 1};
+#pragma omp target update to(_mpi_type_lens[0 : _MPI_DATATYPES])
+}
+// -------- END BLOCK TO DELETE WHEN MPI_Type_size on GPU WORKS ---------------
+
+///  Device stubs must use the identical host API interface.
+///  Stubs call _emissary_exec with additional args that include
+///  the identifier and additional D2H and H2D transfer vectors.
+///  whose params include an identifier
+///
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+
+extern "C" int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
+                        int dest, int tag, MPI_Comm comm) {
+  uint64_t dt_signature = (uint64_t)datatype;
+  int datatype_size = 8; // Default in case we do not have it in our table.
+  for (int i = 0; i < _MPI_DATATYPES; i++)
+    if (_mpi_type_lens[i].dt_signature == dt_signature) {
+      datatype_size = _mpi_type_lens[i].dt_size;
+      break;
+    }
+  return (int)_emissary_exec(
+      // The emissary identifier is a static 64 bit field that encodes
+      // the emissary id, emissary function, D2H Xfer cnt, and H2D Xfer cnt.
+      _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0),
+      // Each D2H transfer vector needs a pair of args to describe the xfer
+      // The first is the device pointer, the 2nd is size.
+      buf, (int)count * datatype_size,
+      // These are the actual 6 MPI_Send Args passed directly from the params
+      buf, count, datatype, dest, tag, comm);
+}
+extern "C" int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
+                        int tag, MPI_Comm comm, MPI_Status *st) {
+  uint64_t dt_signature = (uint64_t)datatype;
+  int datatype_size = 8;
+  for (int i = 0; i < _MPI_DATATYPES; i++)
+    if (_mpi_type_lens[i].dt_signature == dt_signature) {
+      datatype_size = _mpi_type_lens[i].dt_size;
+      break;
+    }
+  return (int)_emissary_exec(_PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1),
+                             buf,
+                             (int)count * datatype_size, // This is a H2D Xfer
+                             buf, count, datatype, source, tag, comm, st);
+}
+extern "C" int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) {
+  return (int)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Allreduce_idx, 1, 1), sendbuf, recvbuf,
+      count, datatype, op, comm);
+}
+extern "C" int MPI_Reduce(const void *sendbuf, void *recvbuf, int count,
+                          MPI_Datatype datatype, MPI_Op op, int root,
+                          MPI_Comm comm) {
+  return (int)_emissary_exec(_PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Reduce_idx, 1, 1),
+                             sendbuf, recvbuf, count, datatype, op, root, comm);
+}
+
+#else
+
+/// Host variadic wrapper functions.
+extern "C" {
+extern int V_MPI_Send(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  int v1 = va_arg(args, int);
+  MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+  int v3 = va_arg(args, int);
+  int v4 = va_arg(args, int);
+  MPI_Comm v5 = va_arg(args, MPI_Comm);
+  va_end(args);
+  int rval = MPI_Send(v0, v1, v2, v3, v4, v5);
+  return rval;
+}
+extern int V_MPI_Recv(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  int v1 = va_arg(args, int);
+  MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+  int v3 = va_arg(args, int);
+  int v4 = va_arg(args, int);
+  MPI_Comm v5 = va_arg(args, MPI_Comm);
+  MPI_Status *v6 = va_arg(args, MPI_Status *);
+  va_end(args);
+  int rval = MPI_Recv(v0, v1, v2, v3, v4, v5, v6);
+  return rval;
+}
+extern int V_MPI_Allreduce(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *buf = va_arg(args, void *);
+  void *recvbuf = va_arg(args, void *);
+  int count = va_arg(args, int);
+  MPI_Datatype datatype = va_arg(args, MPI_Datatype);
+  MPI_Op op = va_arg(args, MPI_Op);
+  MPI_Comm comm = va_arg(args, MPI_Comm);
+  va_end(args);
+  int rval = MPI_Allreduce(buf, recvbuf, count, datatype, op, comm);
+  return rval;
+}
+extern int V_MPI_Reduce(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *buf = va_arg(args, void *);
+  void *recvbuf = va_arg(args, void *);
+  int count = va_arg(args, int);
+  MPI_Datatype datatype = va_arg(args, MPI_Datatype);
+  MPI_Op op = va_arg(args, MPI_Op);
+  int root = va_arg(args, int);
+  MPI_Comm comm = va_arg(args, MPI_Comm);
+  va_end(args);
+  int rval = MPI_Reduce(buf, recvbuf, count, datatype, op, root, comm);
+  return rval;
+}
+
+/// EmissaryMPI function selector
+EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab, emis_argptr_t *a[]) {
+
+  switch (ab->emisfnid) {
+  case _MPI_Send_idx: {
+    void *fnptr = (void *)V_MPI_Send;
+    int return_value_int =
+        V_MPI_Send(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  case _MPI_Recv_idx: {
+    void *fnptr = (void *)V_MPI_Recv;
+    int return_value_int =
+        V_MPI_Recv(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  case _MPI_Allreduce_idx: {
+    void *fnptr = (void *)V_MPI_Allreduce;
+    int return_value_int =
+        V_MPI_Allreduce(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  case _MPI_Reduce_idx: {
+    void *fnptr = (void *)V_MPI_Reduce;
+    int return_value_int =
+        V_MPI_Reduce(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  }
+  return (EmissaryReturn_t)0;
+}
+
+} // end extern "C"
+
+#endif
diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdio.h b/clang/lib/Headers/llvm_libc_wrappers/stdio.h
index 0c3e44823da70..e8c0250cfb01b 100644
--- a/clang/lib/Headers/llvm_libc_wrappers/stdio.h
+++ b/clang/lib/Headers/llvm_libc_wrappers/stdio.h
@@ -21,6 +21,23 @@
 #define __LIBC_ATTRS
 #endif
 
+// To turn off emissary print (and this macro) set -fno-use-emissary-print.
+#ifdef OFFLOAD_ENABLE_EMISSARY_PRINT
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+#include <EmissaryIds.h>
+#define fprintf(...)                                                           \
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _fprintf_idx, 0, 0),            \
+                 __VA_ARGS__);
+#define printf(...)                                                            \
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx, 0, 0), __VA_ARGS__);
+#define fputc(c, stream) fprintf(stream, "%c", (unsigned char)(c))
+#define putc(c, stream) fprintf(stream, "%c", (unsigned char)(c))
+#define putchar(c) printf("%c", (char)(c))
+#define fputs(str, stream) fprintf((stream), "%s", (str))
+#define puts(str) fprintf(stdout, "%s", (str))
+#endif
+#endif
+
 // Some headers provide these as macros. Temporarily undefine them so they do
 // not conflict with any definitions for the GPU.
 
diff --git a/offload/include/Shared/RPCOpcodes.h b/offload/include/Shared/RPCOpcodes.h
index beee29df1f707..fbdc839620cec 100644
--- a/offload/include/Shared/RPCOpcodes.h
+++ b/offload/include/Shared/RPCOpcodes.h
@@ -18,6 +18,8 @@
 
 typedef enum {
   OFFLOAD_HOST_CALL = LLVM_OFFLOAD_OPCODE(0),
+  OFFLOAD_EMISSARY = LLVM_OFFLOAD_OPCODE(1),
+  OFFLOAD_EMISSARY_DM = LLVM_OFFLOAD_OPCODE(2),
 } offload_opcode_t;
 
 #undef LLVM_OFFLOAD_OPCODE
diff --git a/offload/libomptarget/CMakeLists.txt b/offload/libomptarget/CMakeLists.txt
index 93e684e53bf17..d3247c09a50a8 100644
--- a/offload/libomptarget/CMakeLists.txt
+++ b/offload/libomptarget/CMakeLists.txt
@@ -57,6 +57,7 @@ target_compile_definitions(omptarget PRIVATE
 foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
   target_link_libraries(omptarget PRIVATE omptarget.rtl.${plugin})
 endforeach()
+target_link_libraries(omptarget PRIVATE flang_rt.runtime)
 
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index ea0910abf95d5..0232fe2d83f28 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -7,6 +7,15 @@ tablegen(OFFLOAD include/OffloadErrcodes.inc -gen-errcodes -I ${CMAKE_CURRENT_SO
 tablegen(OFFLOAD include/OffloadInfo.inc -gen-info -I ${CMAKE_CURRENT_SOURCE_DIR}/../../liboffload/API)
 add_public_tablegen_target(PluginErrcodes)
 
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+  set(emissary_sources
+    src/Emissary.cpp
+    src/EmissaryFortrt.cpp
+    src/EmissaryPrint.cpp
+  )
+  set(emissary_headers ${CMAKE_CURRENT_SOURCE_DIR}/../../../openmp/device/include)
+endif()
+
 # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
 # don't want to export `PluginInterface` while `add_llvm_library` requires that.
 add_library(PluginCommon OBJECT
@@ -16,6 +25,7 @@ add_library(PluginCommon OBJECT
   src/RPC.cpp
   src/OffloadError.cpp
   src/Utils/ELF.cpp
+  ${emissary_sources}
 )
 add_dependencies(PluginCommon intrinsics_gen PluginErrcodes)
 
@@ -34,6 +44,10 @@ endif()
 # Include the RPC server from the `libc` project.
 include(FindLibcCommonUtils)
 target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+  target_link_libraries(PluginCommon PRIVATE flang_rt.runtime
+    -L${CMAKE_BINARY_DIR}/../../lib -L${CMAKE_INSTALL_PREFIX}/lib)
+endif()
 
 # Define the TARGET_NAME and DEBUG_PREFIX.
 target_compile_definitions(PluginCommon PRIVATE
@@ -50,6 +64,7 @@ target_include_directories(PluginCommon PUBLIC
   ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
   ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
   ${LIBOMPTARGET_INCLUDE_DIR}
+  ${emissary_headers}
 )
 
 set_target_properties(PluginCommon PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/offload/plugins-nextgen/common/include/Emissary.h b/offload/plugins-nextgen/common/include/Emissary.h
new file mode 100644
index 0000000000000..317c6ea51136b
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/Emissary.h
@@ -0,0 +1,263 @@
+//===-- offload/plugins-nextgen/common/include/Emissary.h ------ C++ ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines emissary helper functions. This include is only used for host
+// compilation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_H
+#define OFFLOAD_EMISSARY_H
+
+#include "EmissaryIds.h"
+#include "RPC.h"
+// #include "shared/rpc.h"
+#include "shared/rpc_server.h"
+
+extern "C" {
+
+/// Called by rpc after receiving emissary argument buffer
+EmissaryReturn_t Emissary(char *data, emisArgBuf_t *ab,
+                          std::unordered_map<void *, void *> *D2HAddrList);
+
+/// Called by Emissary for all Fortrt emissary functions
+EmissaryReturn_t
+EmissaryFortrt(char *data, emisArgBuf_t *ab,
+               std::unordered_map<void *, void *> *D2HAddrList);
+
+/// Called by Emissary for all misc print functions
+EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab);
+
+/// Called by Emissary for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+                                                 emis_argptr_t *arg[]);
+
+/// Called by Emissary for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+                                                  emis_argptr_t *arg[]);
+
+/// Called by Emissary to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+                                                     emisArgBuf_t *ab,
+                                                     emis_argptr_t *arg[]);
+
+/// emisExtractArgBuf is called within the "case OFFLOAD_EMISSARY:" stanza
+/// in offload/plugins-nextgen/common/src/RPC.cpp to build the emisArgBuf_t
+/// structure from the emissary data buffer sent to the CPU by rpc.
+/// This buffer is created by clang CodeGen when variadic function
+/// _emissary_exec(...) is encountered when compiling any emissary device
+/// stub to define the device function.
+void emisExtractArgBuf(char *buf, emisArgBuf_t *ab);
+
+/// Move the ArgBuf tracker past a set of XferSrgs
+void emisSkipXferArgSet(emisArgBuf_t *ab);
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+uint64_t getuint32(char *val);
+/// Get uint64_t value from a char ptr
+uint64_t getuint64(char *val);
+/// Get a function pointer from a char ptr
+void *getfnptr(char *val);
+
+/// Builds the array of pointers passed to V_ functions
+uint32_t EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr,
+                            char *strptr, unsigned long long *data_not_used,
+                            emis_argptr_t *a[],
+                            std::unordered_map<void *, void *> *D2HAddrList);
+
+} // end extern "C"
+
+/// Call the associated V_ function
+template <typename T, typename FT>
+extern T EmissaryCallFnptr(uint32_t NumArgs, void *fnptr, emis_argptr_t *a[]);
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+enum TypeID {
+  // PrimitiveTypes
+  HalfTyID = 0,  ///< 16-bit floating point type
+  BFloatTyID,    ///< 16-bit floating point type (7-bit significand)
+  FloatTyID,     ///< 32-bit floating point type
+  DoubleTyID,    ///< 64-bit floating point type
+  X86_FP80TyID,  ///< 80-bit floating point type (X87)
+  FP128TyID,     ///< 128-bit floating point type (112-bit significand)
+  PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+  VoidTyID,      ///< type with no size
+  LabelTyID,     ///< Labels
+  MetadataTyID,  ///< Metadata
+  X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
+  TokenTyID,     ///< Tokens
+
+  // Derived types... see DerivedTypes.h file.
+  IntegerTyID,        ///< Arbitrary bit width integers
+  FunctionTyID,       ///< Functions
+  PointerTyID,        ///< Pointers
+  StructTyID,         ///< Structures
+  ArrayTyID,          ///< Arrays
+  FixedVectorTyID,    ///< Fixed width SIMD vector type
+  ScalableVectorTyID, ///< Scalable SIMD vector type
+  TypedPointerTyID,   ///< Typed pointer used by some GPU targets
+  TargetExtTyID,      ///< Target extension type
+};
+
+template <typename T, typename FT>
+extern T EmissaryCallFnptr(uint32_t NumArgs, void *fnptr, emis_argptr_t *a[]) {
+  T rv;
+  FT *vfnptr = (FT *)fnptr;
+  switch (NumArgs) {
+  case 1:
+    rv = (T)vfnptr(fnptr, a[0]);
+    break;
+  case 2:
+    rv = (T)vfnptr(fnptr, a[0], a[1]);
+    break;
+  case 3:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2]);
+    break;
+  case 4:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3]);
+    break;
+  case 5:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4]);
+    break;
+  case 6:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+    break;
+  case 7:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    break;
+  case 8:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    break;
+  case 9:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    break;
+  case 10:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9]);
+    break;
+  case 11:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10]);
+    break;
+  case 12:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11]);
+    break;
+  case 13:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12]);
+    break;
+  case 14:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13]);
+    break;
+  case 15:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14]);
+    break;
+  case 16:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
+    break;
+  case 17:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16]);
+    break;
+  case 18:
+    rv =
+        (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                  a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17]);
+    break;
+  case 19:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18]);
+    break;
+  case 20:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19]);
+    break;
+  case 21:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20]);
+    break;
+  case 22:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21]);
+    break;
+  case 23:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22]);
+    break;
+  case 24:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23]);
+    break;
+  case 25:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24]);
+    break;
+  case 26:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25]);
+    break;
+  case 27:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26]);
+    break;
+  case 28:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27]);
+    break;
+  case 29:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28]);
+    break;
+  case 30:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28], a[29]);
+    break;
+  case 31:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28], a[29], a[30]);
+    break;
+  case 32:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28], a[29], a[30], a[31]);
+    break;
+  default:
+    rv = 0;
+  }
+  return rv;
+}
+
+#endif // OFFLOAD_EMISSARY_H
diff --git a/offload/plugins-nextgen/common/src/Emissary.cpp b/offload/plugins-nextgen/common/src/Emissary.cpp
new file mode 100644
index 0000000000000..0be0e426ee050
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/Emissary.cpp
@@ -0,0 +1,259 @@
+//===----- ioffload/plugins-nexgen/common/include/Emissary.cpp ---- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RPC.h"
+
+#include "Shared/Debug.h"
+#include "Shared/RPCOpcodes.h"
+
+#include "PluginInterface.h"
+
+#include "Emissary.h"
+#include "EmissaryIds.h"
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include <unordered_map>
+
+extern "C" EmissaryReturn_t
+Emissary(char *data, emisArgBuf_t *ab,
+         std::unordered_map<void *, void *> *D2HAddrList) {
+  EmissaryReturn_t result = 0;
+  emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+      sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+  switch (ab->emisid) {
+  case EMIS_ID_INVALID: {
+    fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+    result = 0;
+    break;
+  }
+  case EMIS_ID_FORTRT: {
+    result = EmissaryFortrt(data, ab, D2HAddrList);
+    break;
+  }
+  case EMIS_ID_PRINT: {
+    result = EmissaryPrint(data, ab);
+    break;
+  }
+  case EMIS_ID_MPI: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryMPI(data, ab, args);
+    break;
+  }
+  case EMIS_ID_HDF5: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryHDF5(data, ab, args);
+    break;
+  }
+  case EMIS_ID_RESERVE: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryReserve(data, ab, args);
+    break;
+  }
+  default:
+    fprintf(stderr,
+            "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+            ab->emisid, ab->emisfnid);
+  }
+  free(args);
+  return result;
+}
+
+// emisExtractArgBuf reverses protocol that codegen in EmitEmissaryExec makes.
+extern "C" void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+  uint32_t *int32_data = (uint32_t *)data;
+  ab->DataLen = int32_data[0];
+  ab->NumArgs = int32_data[1];
+
+  // Note: while the data buffer contains all args including strings,
+  // ab->DataLen does not include strings. It only counts header, keys,
+  // and aligned numerics.
+
+  ab->keyptr = data + (2 * sizeof(int));
+  ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+  ab->strptr = data + (size_t)ab->DataLen;
+  int alignfill = 0;
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    alignfill = 4;
+  }
+
+  // Extract the two emissary identifiers and number of send
+  // and recv device data transfers. These are 4 16 bit values
+  // packed into a single 64-bit field.
+  uint64_t arg1 = *(uint64_t *)ab->argptr;
+  ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+  ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+  ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+  // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+  ab->keyptr += sizeof(int);
+  ab->argptr += sizeof(uint64_t);
+  ab->NumArgs -= 1;
+
+  // data_not_used used for testing consistency.
+  ab->data_not_used =
+      (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+                               sizeof(uint64_t) + alignfill);
+
+  // Ensure first arg after emissary id arg is aligned.
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    ab->data_not_used -= 4;
+  }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+extern "C" uint64_t getuint32(char *val) {
+  uint32_t i32 = *(uint32_t *)val;
+  return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+extern "C" uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+/// Get a function pointer from a char ptr
+extern "C" void *getfnptr(char *val) {
+  uint64_t ival = *(uint64_t *)val;
+  return (void *)ival;
+}
+
+// build argument array
+extern "C" uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+                   unsigned long long *data_not_used, emis_argptr_t *a[],
+                   std::unordered_map<void *, void *> *D2HAddrList) {
+  size_t num_bytes;
+  size_t bytes_consumed;
+  size_t strsz;
+  size_t fillerNeeded;
+
+  uint argcount = 0;
+
+  for (int argnum = 0; argnum < NumArgs; argnum++) {
+    num_bytes = 0;
+    strsz = 0;
+    unsigned int key = *(unsigned int *)keyptr;
+    unsigned int llvmID = key >> 16;
+    unsigned int numbits = (key << 16) >> 16;
+
+    switch (llvmID) {
+    case FloatTyID:  ///<  2: 32-bit floating point type
+    case DoubleTyID: ///<  3: 64-bit floating point type
+    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      break;
+
+    case IntegerTyID: ///< 11: Arbitrary bit width integers
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      // fprintf(stderr, "Arg:%d  IntegerType bytes:%ld val:%ld\n",argcount,
+      // num_bytes,(uint64_t) a[argcount]);
+      break;
+
+    case PointerTyID: {   ///< 15: Pointers
+      if (numbits == 1) { // This is a pointer to string
+        num_bytes = 4;
+        bytes_consumed = num_bytes;
+        strsz = (size_t) * (unsigned int *)dataptr;
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)((char *)strptr);
+      } else {
+        num_bytes = 8;
+        bytes_consumed = num_bytes;
+        fillerNeeded = ((size_t)dataptr) % num_bytes;
+        if (fillerNeeded) {
+          dataptr += fillerNeeded; // dataptr is now aligned
+          bytes_consumed += fillerNeeded;
+        }
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      }
+      auto found = D2HAddrList->find((void *)a[argcount]);
+      if (found != D2HAddrList->end())
+        a[argcount] = (emis_argptr_t *)found->second;
+      // fprintf(stderr, "Arg:%d  pointer Type bytes:%ld val:%p\n",argcount,
+      // num_bytes, (void*) a[argcount]);
+    } break;
+
+    case HalfTyID:           ///<  1: 16-bit floating point type
+    case ArrayTyID:          ///< 14: Arrays
+    case StructTyID:         ///< 13: Structures
+    case FunctionTyID:       ///< 12: Functions
+    case TokenTyID:          ///< 10: Tokens
+    case MetadataTyID:       ///<  8: Metadata
+    case LabelTyID:          ///<  7: Labels
+    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
+                             ///<  PowerPC)
+    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
+    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
+    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
+    case TargetExtTyID:      ///< Target extension type
+    case VoidTyID:
+      return _ERC_UNSUPPORTED_ID_ERROR;
+      break;
+    default:
+      return _ERC_INVALID_ID_ERROR;
+    }
+
+    // Move to next argument
+    dataptr += num_bytes;
+    strptr += strsz;
+    *data_not_used -= bytes_consumed;
+    keyptr += 4;
+    argcount++;
+  }
+  return _ERC_SUCCESS;
+}
+
+extern "C" void emisSkipXferArgSet(emisArgBuf_t *ab) {
+  // Skip the ptr and size of the Xfer
+  ab->NumArgs -= 2;
+  ab->keyptr += 2 * sizeof(uint32_t);
+  ab->argptr += 2 * sizeof(void *);
+  ab->data_not_used -= 2 * sizeof(void *);
+}
diff --git a/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp b/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
new file mode 100644
index 0000000000000..7853357b1329a
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
@@ -0,0 +1,470 @@
+//===---- offload/plugins-nextgen/common/src/EmissaryFortrt.cpp  ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Host support for Fortran runtime Emissary API
+//
+//===----------------------------------------------------------------------===//
+#include "PluginInterface.h"
+#include "RPC.h"
+#include "Shared/Debug.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include <Emissary.h>
+#include <EmissaryIds.h>
+#include <assert.h>
+#include <cstring>
+#include <ctype.h>
+#include <list>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+#include <vector>
+
+// Headers for Host Fortran Runtime API as built in llvm/flang/runtime
+extern "C" {
+void *_FortranAioBeginExternalListOutput(uint32_t a1, const char *a2,
+                                         uint32_t a3);
+void *_FortranAioBeginExternalFormattedOutput(const char *ptr1, uint64_t x1,
+                                              void *ptr2, uint32_t x2,
+                                              const char *ptr3, uint32_t x3);
+bool _FortranAioOutputAscii(void *a1, char *a2, uint64_t a3);
+bool _FortranAioOutputInteger32(void *a1, uint32_t a2);
+uint32_t _FortranAioEndIoStatement(void *a1);
+bool _FortranAioOutputInteger8(void *cookie, int8_t n);
+bool _FortranAioOutputInteger16(void *cookie, int16_t n);
+bool _FortranAioOutputInteger64(void *cookie, int64_t n);
+bool _FortranAioOutputReal32(void *cookie, float x);
+bool _FortranAioOutputReal64(void *cookie, double x);
+bool _FortranAioOutputComplex32(void *cookie, float re, float im);
+bool _FortranAioOutputComplex64(void *cookie, double re, double im);
+bool _FortranAioOutputLogical(void *cookie, bool truth);
+void _FortranAAbort();
+void _FortranAStopStatementText(char *errmsg, int64_t a1, bool a2, bool a3);
+void _FortranAStopStatement(int32_t a1, bool a2, bool a3);
+
+//  Save the cookie because deferred functions have execution reordered.
+static void *_list_started_cookie = nullptr;
+extern void *V_FortranAioBeginExternalListOutput(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  int32_t v0 = va_arg(args, int32_t);
+  const char *v1 = va_arg(args, const char *);
+  int32_t v2 = va_arg(args, int32_t);
+  va_end(args);
+  void *cookie = _FortranAioBeginExternalListOutput(v0, v1, v2);
+  _list_started_cookie = cookie;
+  return cookie;
+}
+extern void *V_FortranAioBeginExternalFormattedOutput(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  const char *p0 = va_arg(args, const char *);
+  int64_t v0 = va_arg(args, int64_t);
+  void *p1 = va_arg(args, void *);
+  int32_t v1 = va_arg(args, int32_t);
+  const char *p2 = va_arg(args, const char *);
+  int32_t v2 = va_arg(args, int32_t);
+  va_end(args);
+  void *cookie =
+      _FortranAioBeginExternalFormattedOutput(p0, v0, p1, v1, p2, v2);
+  _list_started_cookie = cookie;
+  return cookie;
+}
+extern bool V_FortranAioOutputAscii(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  char *v1 = va_arg(args, char *);
+  uint64_t v2 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputAscii(v0, v1, v2);
+}
+extern bool V_FortranAioOutputInteger32(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger32(v0, v1);
+}
+extern uint32_t V_FortranAioEndIoStatement(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  va_end(args);
+  v0 = _list_started_cookie;
+  uint32_t rv = _FortranAioEndIoStatement(v0);
+  return rv;
+}
+extern bool V_FortranAioOutputInteger8(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger8(v0, v1);
+}
+extern bool V_FortranAioOutputInteger16(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger16(v0, v1);
+}
+extern bool V_FortranAioOutputInteger64(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger64(v0, v1);
+}
+extern bool V_FortranAioOutputReal32(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  double dv;
+  memcpy(&dv, &v1, 8);
+  return _FortranAioOutputReal32(v0, (float)dv);
+}
+extern bool V_FortranAioOutputReal64(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *cookie = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  va_end(args);
+  cookie = _list_started_cookie;
+  double dv;
+  memcpy(&dv, &v1, 8);
+  return _FortranAioOutputReal64(cookie, dv);
+}
+extern bool V_FortranAioOutputComplex32(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  uint64_t v2 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  double dv1, dv2;
+  memcpy(&dv1, &v1, 8);
+  memcpy(&dv2, &v2, 8);
+  return _FortranAioOutputComplex32(v0, (float)dv1, (float)dv2);
+}
+extern bool V_FortranAioOutputComplex64(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  uint64_t v2 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  double dv1, dv2;
+  memcpy(&dv1, &v1, 8);
+  memcpy(&dv2, &v2, 8);
+  return _FortranAioOutputComplex64(v0, dv1, dv2);
+}
+extern bool V_FortranAioOutputLogical(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputLogical(v0, v1);
+}
+extern void V_FortranAAbort(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  va_end(args);
+  _FortranAAbort();
+  // Now return to device to run abort from stub
+}
+extern void V_FortranAStopStatementText(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  char *errmsg = va_arg(args, char *);
+  int64_t a1 = va_arg(args, int64_t);
+  uint32_t a2 = va_arg(args, uint32_t);
+  uint32_t a3 = va_arg(args, uint32_t);
+  va_end(args);
+  bool b2 = (bool)a2;
+  bool b3 = (bool)a3;
+  _FortranAStopStatementText(errmsg, a1, b2, b3);
+}
+extern void V_FortranAStopStatement(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  int32_t a1 = va_arg(args, int32_t);
+  uint32_t a2 = va_arg(args, uint32_t);
+  uint32_t a3 = va_arg(args, uint32_t);
+  va_end(args);
+  bool b2 = (bool)a2;
+  bool b3 = (bool)a3;
+  _FortranAStopStatement(a1, b2, b3);
+}
+} // end extern "C"
+
+// Static vars used to defer functions to reorder execution by thread and team.
+static uint32_t _deferred_fn_count = 0;
+static uint32_t _deferred_begin_statements = 0;
+static uint32_t _deferred_end_statements = 0;
+static uint64_t _max_num_threads = 0;
+static uint64_t _max_num_teams = 0;
+
+// structure for deferred functions
+typedef struct {
+  uint32_t NumArgs;    // The number of args in arg_array
+  void *fnptr;         // The function pointer for this index
+  uint64_t fn_idx;     // The function index, good for debug
+  uint32_t dfnid;      // The dvoideferred function id, in order received
+  uint64_t *arg_array; // ptr to malloced arg_array
+  char *c_ptr;         // ptr to null terminated char string
+  char *c_ptr2;        // ptr to null terminated char string
+  uint64_t thread_num;
+  uint64_t num_threads;
+  uint64_t team_num;
+  uint64_t num_teams;
+  EmissaryReturn_t return_value;
+} deferred_entry_t;
+
+static std::vector<deferred_entry_t *> *_deferred_fns_ptr;
+// static std::list<deferred_entry_t *> _deferred_fns;
+//
+
+extern "C" EmissaryReturn_t
+EmissaryFortrt(char *data, emisArgBuf_t *ab,
+               std::unordered_map<void *, void *> *D2HAddrList) {
+  EmissaryReturn_t return_value = (EmissaryReturn_t)0;
+
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+
+  void *fnptr;
+  if (ab->NumArgs <= 0)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  emis_argptr_t *a[MAXVARGS];
+  if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                         &ab->data_not_used, a, D2HAddrList) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // std::list<deferred_entry_t *> _deferred_fns;
+  if (!_deferred_fns_ptr)
+    _deferred_fns_ptr = new std::vector<deferred_entry_t *>;
+
+  char *c_ptr = nullptr;
+  char *c_ptr2 = nullptr;
+  bool defer_for_reorder = true;
+  bool run_deferred_functions = false;
+  switch (ab->emisfnid) {
+  case _FortranAioBeginExternalListOutput_idx: {
+    _deferred_begin_statements++;
+    fnptr = (void *)V_FortranAioBeginExternalListOutput;
+    size_t slen = std::strlen((char *)a[5]) + 1;
+    c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr)
+      fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+    std::strncpy(c_ptr, (char *)a[5], slen - 1);
+    c_ptr[slen - 1] = (char)0;
+    a[5] = (emis_argptr_t *)c_ptr;
+    break;
+  }
+  case _FortranAioBeginExternalFormattedOutput_idx: {
+    _deferred_begin_statements++;
+    fnptr = (void *)V_FortranAioBeginExternalFormattedOutput;
+    size_t slen = std::strlen((char *)a[8]) + 1;
+    c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr)
+      fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+    std::strncpy(c_ptr, (char *)a[8], slen - 1);
+    c_ptr[slen - 1] = (char)0;
+    a[8] = (emis_argptr_t *)c_ptr;
+
+    slen = std::strlen((char *)a[4]) + 1;
+    c_ptr2 = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr2)
+      fprintf(stderr, "MALLOC FAILED for c_ptr2 size:%ld \n", slen);
+    std::strncpy(c_ptr2, (char *)a[4], slen - 1);
+    c_ptr2[slen - 1] = (char)0;
+    a[4] = (emis_argptr_t *)c_ptr2;
+    break;
+  }
+  case _FortranAioOutputAscii_idx: {
+    fnptr = (void *)V_FortranAioOutputAscii;
+
+    size_t slen = (size_t)a[6] + 1;
+    c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr)
+      fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+    std::strncpy(c_ptr, (char *)a[5], slen - 1);
+    c_ptr[slen - 1] = (char)0;
+    a[5] = (emis_argptr_t *)c_ptr;
+
+    break;
+  }
+  case _FortranAioOutputInteger32_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger32;
+    break;
+  }
+  case _FortranAioEndIoStatement_idx: {
+    _deferred_end_statements++;
+    fnptr = (void *)V_FortranAioEndIoStatement;
+    // We cannot use last tread and team number to trigger running deferred
+    // functions because its warp could finish early (out of order). So, if
+    // this is the last FortranAioEndIoStatement by count of begin statements,
+    // then run the deferred functions ordered by team and thread number.
+    if (_deferred_end_statements == _deferred_begin_statements)
+      run_deferred_functions = true;
+    break;
+  }
+  case _FortranAioOutputInteger8_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger8;
+    break;
+  }
+  case _FortranAioOutputInteger16_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger16;
+    break;
+  }
+  case _FortranAioOutputInteger64_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger64;
+    break;
+  }
+  case _FortranAioOutputReal32_idx: {
+    fnptr = (void *)V_FortranAioOutputReal32;
+    break;
+  }
+  case _FortranAioOutputReal64_idx: {
+    fnptr = (void *)V_FortranAioOutputReal64;
+    break;
+  }
+  case _FortranAioOutputComplex32_idx: {
+    fnptr = (void *)V_FortranAioOutputComplex32;
+    break;
+  }
+  case _FortranAioOutputComplex64_idx: {
+    fnptr = (void *)V_FortranAioOutputComplex64;
+    break;
+  }
+  case _FortranAioOutputLogical_idx: {
+    fnptr = (void *)V_FortranAioOutputLogical;
+    break;
+  }
+  case _FortranAAbort_idx: {
+    defer_for_reorder = false;
+    fnptr = (void *)V_FortranAAbort;
+    break;
+  }
+  case _FortranAStopStatementText_idx: {
+    defer_for_reorder = false;
+    fnptr = (void *)V_FortranAStopStatementText;
+    break;
+  }
+  case _FortranAStopStatement_idx: {
+    defer_for_reorder = false;
+    fnptr = (void *)V_FortranAStopStatement;
+    break;
+  }
+  case _FortranAio_INVALID:
+  default: {
+    defer_for_reorder = false;
+    break;
+  }
+  } // end of switch
+
+  if (defer_for_reorder) {
+    _deferred_fn_count++;
+    deferred_entry_t *q = new deferred_entry_t;
+
+    q->dfnid = _deferred_fn_count - 1;
+    q->thread_num = (uint64_t)a[0];
+    q->num_threads = (uint64_t)a[1];
+    _max_num_threads =
+        (q->num_threads > _max_num_threads) ? q->num_threads : _max_num_threads;
+    q->team_num = (uint64_t)a[2];
+    q->num_teams = (uint64_t)a[3];
+    _max_num_teams =
+        (q->num_teams > _max_num_teams) ? q->num_teams : _max_num_teams;
+    q->NumArgs = ab->NumArgs - 4;
+    q->fnptr = fnptr;
+    q->fn_idx = ab->emisfnid;
+    uint64_t *arg_array = (uint64_t *)aligned_alloc(
+        sizeof(uint64_t), (ab->NumArgs - 4) * sizeof(uint64_t));
+    if (!arg_array)
+      fprintf(stderr, " MALLOC FAILED for arg_array size:%ld \n",
+              sizeof(uint64_t) * (ab->NumArgs - 4));
+    for (uint32_t i = 0; i < ab->NumArgs - 4; i++) {
+      uint64_t val = (uint64_t)a[i + 4];
+      arg_array[i] = val;
+    }
+    q->arg_array = arg_array;
+    q->return_value = (EmissaryReturn_t)0;
+    q->c_ptr = c_ptr;
+    q->c_ptr2 = c_ptr2;
+    _deferred_fns_ptr->push_back(q);
+  } else {
+    // execute a non deferred function
+    return_value = EmissaryCallFnptr<EmissaryReturn_t, emisfn_t>(
+        ab->NumArgs - 4, fnptr, &a[4]);
+  }
+
+  if (run_deferred_functions) {
+    // This specific team and thread ordering does not reflect the
+    // actual non-deterministic ordering.
+    for (uint32_t team_num = 0; team_num < _max_num_teams; team_num++) {
+      for (uint32_t thread_num = 0; thread_num < _max_num_threads;
+           thread_num++) {
+        for (auto q : *_deferred_fns_ptr) {
+          if ((thread_num == q->thread_num) && (team_num == q->team_num)) {
+            for (uint32_t i = 0; i < q->NumArgs; i++)
+              a[i] = (emis_argptr_t *)q->arg_array[i];
+            q->return_value = EmissaryCallFnptr<EmissaryReturn_t, emisfn_t>(
+                q->NumArgs, q->fnptr, a);
+          }
+          // Only the return value for the last end statement is returned.
+          return_value = q->return_value;
+        }
+      }
+    }
+
+    //  Reset static deferred function counters and free memory
+    for (auto q : *_deferred_fns_ptr) {
+      if (q->c_ptr)
+        free(q->c_ptr);
+      if (q->c_ptr2)
+        free(q->c_ptr2);
+      free(q->arg_array);
+      delete q;
+    }
+    _deferred_fns_ptr->clear();
+    _deferred_fn_count = 0;
+    _deferred_begin_statements = 0;
+    _deferred_end_statements = 0;
+    _max_num_threads = 0;
+    _max_num_teams = 0;
+    delete _deferred_fns_ptr;
+    _deferred_fns_ptr = nullptr;
+  } // end run_deferred_functions
+
+  return return_value;
+} // end EmissaryFortrt
diff --git a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
new file mode 100644
index 0000000000000..80c55d3c5a5fd
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
@@ -0,0 +1,423 @@
+//===--- offload/plugins-nextgen/common/src/EmissaryPrint.cpp ----- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  Host support for misc emissary API.
+//
+//===----------------------------------------------------------------------===//
+#include <Emissary.h>
+#include <EmissaryIds.h>
+#include <assert.h>
+#include <cstring>
+#include <ctype.h>
+#include <list>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab);
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab);
+
+extern "C" EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab) {
+  uint32_t return_value;
+  service_rc rc;
+  switch (ab->emisfnid) {
+  case _printf_idx: {
+    rc = emissary_printf(&return_value, ab);
+    break;
+  }
+  case _fprintf_idx: {
+    rc = emissary_fprintf(&return_value, ab);
+    break;
+  }
+  case _ockl_asan_report_idx: {
+    fprintf(stderr, " asan_report not yet implemented\n");
+    return_value = 0;
+    rc = _ERC_STATUS_ERROR;
+    break;
+  }
+  case _print_INVALID:
+  default: {
+    fprintf(stderr, " INVALID emissary function id (%d) for PRINT API \n",
+            ab->emisfnid);
+    return_value = 0;
+    rc = _ERC_STATUS_ERROR;
+    break;
+  }
+  }
+  if (rc != _ERC_SUCCESS)
+    fprintf(stderr, "HOST failure in _emissary_execute_print rc:%d\n", rc);
+
+  return (EmissaryReturn_t)return_value;
+}
+
+// NUMFPREGS and FPREGSZ are part of x86 vargs ABI that
+// is recreated with the printf support.
+#define NUMFPREGS 8
+#define FPREGSZ 16
+
+typedef int uint128_t __attribute__((mode(TI)));
+struct emissary_pfIntRegs {
+  uint64_t rdi, rsi, rdx, rcx, r8, r9;
+};
+typedef struct emissary_pfIntRegs emissary_pfIntRegs_t; // size = 48 bytes
+
+struct emissary_pfRegSaveArea {
+  emissary_pfIntRegs_t iregs;
+  uint128_t freg[NUMFPREGS];
+};
+typedef struct emissary_pfRegSaveArea
+    emissary_pfRegSaveArea_t; // size = 304 bytes
+
+struct emissary_ValistExt {
+  uint32_t gp_offset;      /* offset to next available gpr in reg_save_area */
+  uint32_t fp_offset;      /* offset to next available fpr in reg_save_area */
+  void *overflow_arg_area; /* args that are passed on the stack */
+  emissary_pfRegSaveArea_t *reg_save_area; /* int and fp registers */
+  size_t overflow_size;
+} __attribute__((packed));
+typedef struct emissary_ValistExt emissary_ValistExt_t;
+
+// Handle overflow when building the va_list for vprintf
+static service_rc emissary_pfGetOverflow(emissary_ValistExt_t *valist,
+                                         size_t needsize) {
+  if (needsize < valist->overflow_size)
+    return _ERC_SUCCESS;
+
+  // Make the overflow area bigger
+  size_t stacksize;
+  void *newstack;
+  if (valist->overflow_size == 0) {
+    // Make initial save area big to reduce mallocs
+    stacksize = (FPREGSZ * NUMFPREGS) * 2;
+    if (needsize > stacksize)
+      stacksize = needsize; // maybe a big string
+  } else {
+    // Initial save area not big enough, double it
+    stacksize = valist->overflow_size * 2;
+  }
+  if (!(newstack = malloc(stacksize))) {
+    return _ERC_STATUS_ERROR;
+  }
+  memset(newstack, 0, stacksize);
+  if (valist->overflow_size) {
+    memcpy(newstack, valist->overflow_arg_area, valist->overflow_size);
+    free(valist->overflow_arg_area);
+  }
+  valist->overflow_arg_area = newstack;
+  valist->overflow_size = stacksize;
+  return _ERC_SUCCESS;
+}
+
+// Add an integer to the va_list for vprintf
+static service_rc emissary_pfAddInteger(emissary_ValistExt_t *valist, char *val,
+                                        size_t valsize, size_t *stacksize) {
+  uint64_t ival;
+  switch (valsize) {
+  case 1:
+    ival = *(uint8_t *)val;
+    break;
+  case 2:
+    ival = *(uint32_t *)val;
+    break;
+  case 4:
+    ival = (*(uint32_t *)val);
+    break;
+  case 8:
+    ival = *(uint64_t *)val;
+    break;
+  default: {
+    return _ERC_STATUS_ERROR;
+  }
+  }
+  //  Always copy 8 bytes, sizeof(ival)
+  if ((valist->gp_offset + sizeof(ival)) <= sizeof(emissary_pfIntRegs_t)) {
+    memcpy(((char *)valist->reg_save_area + valist->gp_offset), &ival,
+           sizeof(ival));
+    valist->gp_offset += sizeof(ival);
+    return _ERC_SUCCESS;
+  }
+  // Ensure valist overflow area is big enough
+  size_t needsize = (size_t)*stacksize + sizeof(ival);
+  if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+    return _ERC_STATUS_ERROR;
+  // Copy to overflow
+  memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, &ival,
+         sizeof(ival));
+
+  *stacksize += sizeof(ival);
+  return _ERC_SUCCESS;
+}
+
+// Add a String argument when building va_list for vprintf
+static service_rc emissary_pfAddString(emissary_ValistExt_t *valist, char *val,
+                                       size_t strsz, size_t *stacksize) {
+  size_t valsize =
+      sizeof(char *); // ABI captures pointer to string,  not string
+  if ((valist->gp_offset + valsize) <= sizeof(emissary_pfIntRegs_t)) {
+    memcpy(((char *)valist->reg_save_area + valist->gp_offset), val, valsize);
+    valist->gp_offset += valsize;
+    return _ERC_SUCCESS;
+  }
+  size_t needsize = (size_t)*stacksize + valsize;
+  if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+    return _ERC_STATUS_ERROR;
+  memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, val,
+         valsize);
+  *stacksize += valsize;
+  return _ERC_SUCCESS;
+}
+
+// Add a floating point value when building va_list for vprintf
+static service_rc emissary_pfAddFloat(emissary_ValistExt_t *valist,
+                                      char *numdata, size_t valsize,
+                                      size_t *stacksize) {
+  // we could use load because doubles are now aligned
+  double dval;
+  if (valsize == 4) {
+    float fval;
+    memcpy(&fval, numdata, 4);
+    dval = (double)fval; // Extend single to double per abi
+  } else if (valsize == 8) {
+    memcpy(&dval, numdata, 8);
+  } else {
+    return _ERC_STATUS_ERROR;
+  }
+  if ((valist->fp_offset + FPREGSZ) <= sizeof(emissary_pfRegSaveArea_t)) {
+    memcpy(((char *)valist->reg_save_area + (size_t)(valist->fp_offset)), &dval,
+           sizeof(double));
+    valist->fp_offset += FPREGSZ;
+    return _ERC_SUCCESS;
+  }
+  size_t needsize = (size_t)*stacksize + sizeof(double);
+  if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+    return _ERC_STATUS_ERROR;
+  memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, &dval,
+         sizeof(double));
+  // move only by the size of the double (8 bytes)
+  *stacksize += sizeof(double);
+  return _ERC_SUCCESS;
+}
+
+// Build an extended va_list for vprintf by unpacking the buffer
+static service_rc emissary_pfBuildValist(emissary_ValistExt_t *valist,
+                                         int NumArgs, char *keyptr,
+                                         char *dataptr, char *strptr,
+                                         unsigned long long *data_not_used) {
+  emissary_pfRegSaveArea_t *regs;
+  size_t regs_size = sizeof(*regs);
+  regs = (emissary_pfRegSaveArea_t *)malloc(regs_size);
+  if (!regs)
+    return _ERC_STATUS_ERROR;
+  memset(regs, 0, regs_size);
+  *valist = (emissary_ValistExt_t){
+      .gp_offset = 0,
+      .fp_offset = 0,
+      .overflow_arg_area = NULL,
+      .reg_save_area = regs,
+      .overflow_size = 0,
+  };
+
+  size_t num_bytes;
+  size_t bytes_consumed;
+  size_t strsz;
+  size_t fillerNeeded;
+
+  size_t stacksize = 0;
+
+  for (int argnum = 0; argnum < NumArgs; argnum++) {
+    num_bytes = 0;
+    strsz = 0;
+    unsigned int key = *(unsigned int *)keyptr;
+    unsigned int llvmID = key >> 16;
+    unsigned int numbits = (key << 16) >> 16;
+    switch (llvmID) {
+    case FloatTyID:  ///<  2: 32-bit floating point type
+    case DoubleTyID: ///<  3: 64-bit floating point type
+    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+      if (valist->fp_offset == 0)
+        valist->fp_offset = sizeof(emissary_pfIntRegs_t);
+      if (emissary_pfAddFloat(valist, dataptr, num_bytes, &stacksize))
+        return _ERC_ADDFLOAT_ERROR;
+      break;
+
+    case IntegerTyID: ///< 11: Arbitrary bit width integers
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+      if (emissary_pfAddInteger(valist, dataptr, num_bytes, &stacksize))
+        return _ERC_ADDINT_ERROR;
+      break;
+
+    case PointerTyID:     ///< 15: Pointers
+      if (numbits == 1) { // This is a pointer to string
+        num_bytes = 4;
+        bytes_consumed = num_bytes;
+        strsz = (size_t) * (unsigned int *)dataptr;
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        if (emissary_pfAddString(valist, (char *)&strptr, strsz, &stacksize))
+          return _ERC_ADDSTRING_ERROR;
+      } else {
+        num_bytes = 8;
+        bytes_consumed = num_bytes;
+        fillerNeeded = ((size_t)dataptr) % num_bytes;
+        if (fillerNeeded) {
+          dataptr += fillerNeeded; // dataptr is now aligned
+          bytes_consumed += fillerNeeded;
+        }
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        if (emissary_pfAddInteger(valist, dataptr, num_bytes, &stacksize))
+          return _ERC_ADDINT_ERROR;
+      }
+      break;
+
+    case HalfTyID:           ///<  1: 16-bit floating point type
+    case ArrayTyID:          ///< 14: Arrays
+    case StructTyID:         ///< 13: Structures
+    case FunctionTyID:       ///< 12: Functions
+    case TokenTyID:          ///< 10: Tokens
+    case MetadataTyID:       ///<  8: Metadata
+    case LabelTyID:          ///<  7: Labels
+    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
+                             ///<  PowerPC)
+    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
+    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
+    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
+    case TargetExtTyID:      ///< Target extension type
+    case VoidTyID:
+      return _ERC_UNSUPPORTED_ID_ERROR;
+      break;
+    default:
+      return _ERC_INVALID_ID_ERROR;
+    }
+
+    dataptr += num_bytes;
+    strptr += strsz;
+    *data_not_used -= bytes_consumed;
+    keyptr += 4;
+  }
+  return _ERC_SUCCESS;
+} // end emissary_pfBuildValist
+
+/*
+ *  The buffer to pack arguments for all vargs functions has thes 4 sections:
+ *  1. Header  datalen 4 bytes
+ *             numargs 4 bytes
+ *  2. keyptr  A 4-byte key for each arg including string args
+ *             Each 4-byte key contains llvmID and numbits to
+ *             describe the datatype.
+ *  3. argptr  Ths data values for each argument.
+ *             Each arg is aligned according to its size.
+ *             If the field is a string
+ *             the dataptr contains the string length.
+ *  4. strptr  Exection time string values
+ */
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
+
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+
+  char *fmtstr = ab->strptr;
+  FILE *fileptr = (FILE *)*((size_t *)ab->argptr);
+
+  // Skip past the file pointer
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  ab->argptr += sizeof(FILE *);
+  ab->data_not_used -= sizeof(FILE *);
+
+  // Skip past the format string
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+  ab->strptr += abstrsz;
+  ab->argptr += 4;
+  ab->data_not_used -= 4;
+
+  emissary_ValistExt_t valist;
+  va_list *real_va_list;
+  real_va_list = (va_list *)&valist;
+
+  if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+                             ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // Roll back offsets and save stack pointer
+  valist.gp_offset = 0;
+  valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+  void *save_stack = valist.overflow_arg_area;
+  *rc = vfprintf(fileptr, fmtstr, *real_va_list);
+  if (valist.reg_save_area)
+    free(valist.reg_save_area);
+  if (save_stack)
+    free(save_stack);
+  return _ERC_SUCCESS;
+}
+
+static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab) {
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+
+  char *fmtstr = ab->strptr;
+
+  // Skip past the format string
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+  ab->strptr += abstrsz;
+  ab->argptr += 4;
+  ab->data_not_used -= 4;
+
+  emissary_ValistExt_t valist;
+  va_list *real_va_list;
+  real_va_list = (va_list *)&valist;
+
+  if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+                             ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // Roll back offsets and save stack pointer for
+  valist.gp_offset = 0;
+  valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+  void *save_stack = valist.overflow_arg_area;
+  *rc = vprintf(fmtstr, *real_va_list);
+  if (valist.reg_save_area)
+    free(valist.reg_save_area);
+  if (save_stack)
+    free(save_stack);
+  return _ERC_SUCCESS;
+}
+
+extern "C" void *global_allocate(uint32_t bufsz) {
+  return malloc((size_t)bufsz);
+}
+extern "C" int global_free(void *ptr) {
+  free(ptr);
+  return 0;
+}
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 8d6704733970d..c3f041aa26637 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -13,9 +13,11 @@
 
 #include "PluginInterface.h"
 
+#include "Emissary.h"
 #include "shared/rpc.h"
 #include "shared/rpc_opcodes.h"
 #include "shared/rpc_server.h"
+#include <unordered_map>
 
 using namespace llvm;
 using namespace omp;
@@ -63,6 +65,156 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
     });
     break;
   }
+
+  // This case handles the device function __llvm_emissary_rpc for emissary
+  // APIs that require no d2h or h2d memory transfer.
+  case OFFLOAD_EMISSARY: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    Port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+    uint32_t id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t ab;
+        emisExtractArgBuf((char *)buffer_ptr, &ab);
+        Results[id++] = Emissary((char *)buffer_ptr, &ab, nullptr);
+      }
+    }
+    Port.send([&](rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+    break;
+  }
+
+  // This case handles the device function __llvm_emissary_rpc_dm for emissary
+  // APIs require D2H or H2D transfer vectors to be processed through the Port.
+  // FIXME: test with multiple transfer vectors of the same type.
+  case OFFLOAD_EMISSARY_DM: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    Port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+    uint32_t id = 0;
+    emisArgBuf_t AB[NumLanes];
+    std::unordered_map<void *, void *> D2HAddrList;
+    void *Xfers[NumLanes] = {nullptr};
+    void *devXfers[NumLanes] = {nullptr};
+    uint64_t XferSzs[NumLanes] = {0};
+    uint32_t numSendXfers = 0;
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          numSendXfers++;
+          devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+          XferSzs[id] = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          emisSkipXferArgSet(ab);
+        }
+        // Allocate the host space for the receive Xfers
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          size_t devSz = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          void *hostAddr = new char[devSz];
+          D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+    // recv_n for device send_n into new host-allocated Xfers
+    if (numSendXfers)
+      Port.recv_n(Xfers, XferSzs,
+                  [&](uint64_t Size) { return new char[Size]; });
+
+    // Xfers now contains just allocated host addrs for sends and
+    // devXfers contains corresponding devAddr for those sends
+    // Build map to pass to Emissary
+    id = 0;
+    for (void *Xfer : Xfers) {
+      if (Xfer) {
+        D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+        id++;
+      }
+    }
+
+    // Call Emissary for each active lane
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+          emisSkipXferArgSet(ab);
+        Results[id] = Emissary((char *)buffer_ptr, ab, &D2HAddrList);
+        id++;
+      }
+    }
+
+    // Process send_n for the H2D Xfers.
+    void *recvXfers[NumLanes] = {nullptr};
+    uint64_t recvXferSzs[NumLanes] = {0};
+    id = 0;
+    uint32_t numRecvXfers = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset ArgBuf tracker
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          numRecvXfers++;
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          recvXfers[id] = D2HAddrList[devAddr];
+          recvXferSzs[id] =
+              (uint64_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+    if (numRecvXfers)
+      Port.send_n(recvXfers, recvXferSzs);
+    // Cleanup all host allocated transfer buffers
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset the ArgBuf tracker ab
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        // Cleanup host allocated send Xfers
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        // Cleanup host allocated bufs
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+
+    Port.send([&](rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+
+    break;
+  } // END CASE OFFLOAD_EMISSARY
+
   default:
     return rpc::RPC_UNHANDLED_OPCODE;
     break;
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..48873601e4f8f 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -7,6 +7,15 @@ if(LLVM_VERSION_MAJOR AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
                       " is not 'Clang ${req_ver}'.")
 endif()
 
+option(OFFLOAD_ENABLE_EMISSARY_APIS "Enable build of GPU Emissary APIs" ON)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+  add_definitions(-DOFFLOAD_ENABLE_EMISSARY_APIS)
+  set(emissary_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/EmissaryFortrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/EmissaryPrint.cpp
+  )
+endif()
+
 set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Allocator.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Configuration.cpp
@@ -23,6 +32,7 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Tasking.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceUtils.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
+  ${emissary_sources}
 )
 
 list(APPEND compile_options -flto)
diff --git a/openmp/device/src/EmissaryFortrt.cpp b/openmp/device/src/EmissaryFortrt.cpp
new file mode 100644
index 0000000000000..a0d45f4ae2080
--- /dev/null
+++ b/openmp/device/src/EmissaryFortrt.cpp
@@ -0,0 +1,144 @@
+//===- EmissaryFortrt.cpp - Fortran Runtime emissary API ----- ---- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device stubs for Fortran Runtime emissary API
+//
+//===----------------------------------------------------------------------===//
+
+#include "DeviceTypes.h"
+#include "EmissaryIds.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+
+unsigned long long _emissary_exec(unsigned long long, ...);
+
+extern "C" {
+
+// The clang compiler will generate calls to this only when a string length is
+// not a compile time constant.
+uint32_t __strlen_max(char *instr, uint32_t maxstrlen) {
+  for (uint32_t i = 0; i < maxstrlen; i++)
+    if (instr[i] == (char)0)
+      return (uint32_t)(i + 1);
+  return maxstrlen;
+}
+
+uint32_t omp_get_thread_num();
+uint32_t omp_get_num_threads();
+uint32_t omp_get_team_num();
+uint32_t omp_get_num_teams();
+
+// All Fortran Runtime Functions pass 4 extra args to assist with
+// defered execution and debug. The host variadic wrappers do not use
+// these arguments when calling the actual Fortran runtime.
+#define _EXTRA_ARGS                                                            \
+  omp_get_thread_num(), omp_get_num_threads(), omp_get_team_num(),             \
+      omp_get_num_teams()
+#define _START_ARGS(idx) _PACK_EMIS_IDS(EMIS_ID_FORTRT, idx, 0, 0), _EXTRA_ARGS,
+
+void *_FortranAioBeginExternalListOutput(uint32_t a1, const char *a2,
+                                         uint32_t a3) {
+  void *cookie = (void *)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioBeginExternalListOutput_idx, 0,
+                     0),
+      _EXTRA_ARGS, a1, a2, a3);
+  return cookie;
+}
+
+void *_FortranAioBeginExternalFormattedOutput(char *fmt, uint64_t fmtlen,
+                                              void *ptr, uint32_t val1,
+                                              char *source_name,
+                                              uint32_t val2) {
+  fmt[fmtlen - 1] = (char)0;
+  void *cookie = (void *)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT,
+                     _FortranAioBeginExternalFormattedOutput_idx, 0, 0),
+      _EXTRA_ARGS, fmt, fmtlen, ptr, val1, source_name, val2);
+  return cookie;
+}
+
+bool _FortranAioOutputAscii(void *a1, char *a2, uint64_t a3) {
+  // insert null terminating char so  _emissary_exec can correctly
+  // calculate runtime str length
+  a2[a3 - 1] = (char)0;
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputAscii_idx, 0, 0),
+      _EXTRA_ARGS, a1, a2, a3);
+}
+bool _FortranAioOutputInteger32(void *a1, uint32_t a2) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger32_idx, 0, 0),
+      _EXTRA_ARGS, a1, a2);
+}
+uint32_t _FortranAioEndIoStatement(void *a1) {
+  return (uint32_t)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioEndIoStatement_idx, 0, 0),
+      _EXTRA_ARGS, a1);
+}
+bool _FortranAioOutputInteger8(void *cookie, int8_t n) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger8_idx, 0, 0),
+      _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputInteger16(void *cookie, int16_t n) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger16_idx, 0, 0),
+      _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputInteger64(void *cookie, int64_t n) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger64_idx, 0, 0),
+      _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputReal32(void *cookie, float x) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputReal32_idx, 0, 0),
+      _EXTRA_ARGS, cookie, x);
+}
+bool _FortranAioOutputReal64(void *cookie, double x) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputReal64_idx, 0, 0),
+      _EXTRA_ARGS, cookie, x);
+}
+bool _FortranAioOutputComplex32(void *cookie, float re, float im) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputComplex32_idx, 0, 0),
+      _EXTRA_ARGS, cookie, re, im);
+}
+bool _FortranAioOutputComplex64(void *cookie, double re, double im) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputComplex64_idx, 0, 0),
+      _EXTRA_ARGS, cookie, re, im);
+}
+bool _FortranAioOutputLogical(void *cookie, bool barg) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputLogical_idx, 0, 0),
+      _EXTRA_ARGS, cookie, barg);
+}
+void _FortranAAbort() {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAAbort_idx, 0, 0),
+                 _EXTRA_ARGS);
+  // When  host service _FortranAAbort finishes, we must die from the device.
+  __builtin_trap();
+}
+void _FortranAStopStatement(int32_t a1, bool a2, bool a3) {
+  _emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAStopStatement_idx, 0, 0),
+      _EXTRA_ARGS, a1, a2, a3);
+  __builtin_trap();
+}
+void _FortranAStopStatementText(char *errmsg, int64_t a1, bool a2, bool a3) {
+  errmsg[a1 - 1] = (char)0;
+  _emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAStopStatementText_idx, 0, 0),
+      _EXTRA_ARGS, errmsg, a1, a2, a3);
+  __builtin_trap();
+}
+
+} // end extern "C"
+#undef _EXTRA_ARGS
diff --git a/openmp/device/src/EmissaryPrint.cpp b/openmp/device/src/EmissaryPrint.cpp
new file mode 100644
index 0000000000000..80bd0262d5ed7
--- /dev/null
+++ b/openmp/device/src/EmissaryPrint.cpp
@@ -0,0 +1,79 @@
+//===----------- EmissaryPrint.cpp - Misc Emissary API ------------ c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device stubs for misc emissary API
+//
+//===----------------------------------------------------------------------===//
+
+#include "Allocator.h"
+#include "Configuration.h"
+#include "DeviceTypes.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+
+#include "Debug.h"
+#include "EmissaryIds.h"
+
+#if 0
+__attribute__((flatten, always_inline)) void f90print_(char *s) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s\n", s);
+}
+__attribute__((flatten, always_inline)) void f90printi_(char *s, int *i) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s $d\n", s, *i);
+}
+__attribute__((flatten, always_inline)) void f90printl_(char *s, long *i) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %ld\n", s, *i);
+}
+__attribute__((flatten, always_inline)) void f90printf_(char *s, float *f) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %f\n", s, *f);
+}
+__attribute__((flatten, always_inline)) void f90printd_(char *s, double *d) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %g\n", s, *d);
+}
+#endif
+
+// This definition of __ockl_devmem_request and __ockl_sanitizer_report needs to
+// override the weak symbol for __ockl_devmem_request and
+// __ockl_sanitizer_report in rocm device lib ockl.bc because ockl uses
+// hostcall but OpenMP uses rpc.
+//
+extern "C" {
+__attribute__((noinline)) int
+__ockl_sanitizer_report(uint64_t addr, uint64_t pc, uint64_t wgidx,
+                        uint64_t wgidy, uint64_t wgidz, uint64_t wave_id,
+                        uint64_t is_read, uint64_t access_size) {
+  unsigned long long rc = _emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_PRINT, _ockl_asan_report_idx, 0, 0), addr, pc,
+      wgidx, wgidy, wgidz, wave_id, is_read, access_size);
+  return ((int)rc);
+}
+#if SANITIZER_AMDGPU
+__attribute__((noinline)) uint64_t __asan_malloc_impl(uint64_t bufsz,
+                                                      uint64_t pc);
+__attribute__((noinline)) void __asan_free_impl(uint64_t ptr, uint64_t pc);
+#endif
+
+__attribute__((flatten, always_inline)) char *global_allocate(uint32_t bufsz) {
+#if SANITIZER_AMDGPU
+  return (char *)__asan_malloc_impl(bufsz,
+                                    (uint64_t)__builtin_return_address(0));
+#else
+  return (char *)malloc((uint64_t)bufsz);
+#endif
+}
+__attribute__((flatten, always_inline)) int global_free(void *ptr) {
+#if SANITIZER_AMDGPU
+  __asan_free_impl((uint64_t)ptr, (uint64_t)__builtin_return_address(0));
+#else
+  free(ptr);
+#endif
+  return 0;
+}
+
+//
+} // end extern "C"
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 5d5a2a383f2b2..fc300d60ea752 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -134,6 +134,68 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
   Port.close();
   return Ret;
 }
+
+// Calls to __llvm_omp_emissary_rpc and __llvm_omp_emissary_premalloc are
+// generated by device codegen at _emissary_exec call sites.
+// See clang/lib/CodeGen/CGEmitEmissaryExec.cpp
+void *__llvm_emissary_premalloc(uint32_t sz) {
+  return omp_alloc((size_t)sz, omp_default_mem_alloc);
+}
+
+unsigned long long __llvm_emissary_rpc(uint32_t sz32, void *bufdata) {
+  rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_EMISSARY>();
+  Port.send_n(bufdata, (size_t)sz32);
+  unsigned long long Ret;
+  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+    Ret = static_cast<unsigned long long>(Buffer->data[0]);
+  });
+  omp_free(bufdata, omp_default_mem_alloc);
+  Port.close();
+  return Ret;
+}
+
+// This is for emissary APIs that require d2h or h2d memory transfers.
+unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
+  rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_EMISSARY_DM>();
+  Port.send_n(bufdata, (size_t)sz32);
+
+  char *data = (char *)bufdata;
+  uint32_t *int32_data = (uint32_t *)data;
+  uint32_t NumArgs = int32_data[1];
+  char *keyptr = data + (2 * sizeof(int));
+  char *argptr = keyptr + (NumArgs * sizeof(int));
+  if (((size_t)argptr) % (size_t)8)
+    argptr += 4; // argptr must be aligned
+  uint64_t arg1 = *(uint64_t *)argptr;
+  uint32_t NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  uint32_t NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+  // Skip by arg1 and process Send and Recv Xfers if any
+  argptr += sizeof(uint64_t);
+  for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
+    void *D2Hdata = (void *)*((uint64_t *)argptr);
+    argptr += sizeof(void *);
+    size_t D2Hsize = (size_t) * ((size_t *)argptr);
+    argptr += sizeof(size_t);
+    Port.send_n(D2Hdata, D2Hsize);
+  }
+  for (uint32_t idx = 0; idx < NumRecvXfers; idx++) {
+    void *H2Ddata = (void *)*((uint64_t *)argptr);
+    argptr += sizeof(void *);
+    argptr += sizeof(size_t);
+    uint64_t recv_size;
+    void *buf = nullptr;
+    Port.recv_n(&buf, &recv_size,
+                [&](uint64_t) { return reinterpret_cast<void *>(H2Ddata); });
+  }
+
+  unsigned long long Ret;
+  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+    Ret = static_cast<unsigned long long>(Buffer->data[0]);
+  });
+  omp_free(bufdata, omp_default_mem_alloc);
+  Port.close();
+  return Ret;
+}
 }
 
 // C++ ABI helpers.



More information about the Openmp-commits mailing list