[Openmp-commits] [clang] [llvm] [openmp] [OpenMP] support for Emissary APIs as discussed in 89169 (PR #175265)

Greg Rodgers via Openmp-commits openmp-commits at lists.llvm.org
Mon Jan 12 04:45:09 PST 2026


https://github.com/gregrodgers updated https://github.com/llvm/llvm-project/pull/175265

>From cdcedd36b2a590e443974941bd3d4873abe21796 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Fri, 9 Jan 2026 17:01:48 -0600
Subject: [PATCH 1/2] [OpenMP] support for Emissary APIs as discussed in 89169

---
 clang/include/clang/Basic/LangOptions.def     |   1 +
 clang/include/clang/Options/Options.td        |   6 +
 clang/lib/CodeGen/CGEmitEmissaryExec.cpp      | 371 ++++++++++++++
 clang/lib/CodeGen/CGExpr.cpp                  |  11 +
 clang/lib/CodeGen/CMakeLists.txt              |   1 +
 clang/lib/CodeGen/CodeGenFunction.h           |   1 +
 clang/lib/Driver/ToolChains/Clang.cpp         |   7 +
 clang/lib/Headers/CMakeLists.txt              |   2 +
 clang/lib/Headers/EmissaryIds.h               |  97 ++++
 clang/lib/Headers/EmissaryMPI.h               | 202 ++++++++
 clang/lib/Headers/llvm_libc_wrappers/stdio.h  |  17 +
 offload/include/Shared/RPCOpcodes.h           |   2 +
 offload/libomptarget/CMakeLists.txt           |   1 +
 offload/plugins-nextgen/common/CMakeLists.txt |  15 +
 .../plugins-nextgen/common/include/Emissary.h | 263 ++++++++++
 .../plugins-nextgen/common/src/Emissary.cpp   | 259 ++++++++++
 .../common/src/EmissaryFortrt.cpp             | 470 ++++++++++++++++++
 .../common/src/EmissaryPrint.cpp              | 423 ++++++++++++++++
 offload/plugins-nextgen/common/src/RPC.cpp    | 152 ++++++
 openmp/device/CMakeLists.txt                  |  10 +
 openmp/device/src/EmissaryFortrt.cpp          | 144 ++++++
 openmp/device/src/EmissaryPrint.cpp           |  79 +++
 openmp/device/src/Misc.cpp                    |  62 +++
 23 files changed, 2596 insertions(+)
 create mode 100644 clang/lib/CodeGen/CGEmitEmissaryExec.cpp
 create mode 100644 clang/lib/Headers/EmissaryIds.h
 create mode 100644 clang/lib/Headers/EmissaryMPI.h
 create mode 100644 offload/plugins-nextgen/common/include/Emissary.h
 create mode 100644 offload/plugins-nextgen/common/src/Emissary.cpp
 create mode 100644 offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
 create mode 100644 offload/plugins-nextgen/common/src/EmissaryPrint.cpp
 create mode 100644 openmp/device/src/EmissaryFortrt.cpp
 create mode 100644 openmp/device/src/EmissaryPrint.cpp

diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8cba1dbaee24e..69ff43cef7307 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -227,6 +227,7 @@ LANGOPT(OpenMPExtensions  , 1, 1, NotCompatible, "Enable all Clang extensions fo
 LANGOPT(OpenMPSimd        , 1, 0, NotCompatible, "Use SIMD only OpenMP support.")
 LANGOPT(OpenMPUseTLS      , 1, 0, NotCompatible, "Use TLS for threadprivates or runtime calls")
 LANGOPT(OpenMPIsTargetDevice    , 1, 0, NotCompatible, "Generate code only for OpenMP target device")
+LANGOPT(UseEmissaryPrint  , 1, 0, NotCompatible, "Enables use of certain IO functions with Emissary rather than LIBC")
 LANGOPT(OpenMPCUDAMode    , 1, 0, NotCompatible, "Generate code for OpenMP pragmas in SIMT/SPMD mode")
 LANGOPT(OpenMPIRBuilder   , 1, 0, NotCompatible, "Use the experimental OpenMP-IR-Builder codegen path.")
 LANGOPT(OpenMPCUDANumSMs  , 32, 0, NotCompatible, "Number of SMs for CUDA devices.")
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 6a72931727a7c..8cf017d8effd7 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -8940,6 +8940,12 @@ def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
 
 } // let Visibility = [CC1Option, FC1Option]
 
+defm use_emissary_print: BoolFOption<"use-emissary-print",
+  LangOpts<"UseEmissaryPrint">, DefaultTrue,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+  "Enable use of Emissary printf/fprint overriding device libc printf/fprintf">,
+  NegFlag<SetFalse>>;
+
 //===----------------------------------------------------------------------===//
 // Coarray Options
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
new file mode 100644
index 0000000000000..cc574d1ffdcf2
--- /dev/null
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -0,0 +1,371 @@
+//===------- CGEmitEmissaryExec.cpp - Codegen for _emissary_exec --==------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmitEmissaryExec:
+//
+// When a device call to the varadic function _emissary_exec is encountered
+// (in CGExpr.cpp) EmitEmissaryExec does these steps:
+//
+// 1. If string lens are runtime dependent, Emit code to determine runtime len.
+// 2. Emits call to allocate memory __llvm_emissary_premalloc,
+// 3. Emit stores of each arg into arg buffer,
+// 4. Emits call to function __llvm_emissary_rpc or __llvm_emissary_rpc_dm
+//
+// The arg buffer is a struct that contains the length, number of args, an
+// array of 4-byte keys that represent the type of of each arg, an array of
+// aligned "data" values for each arg, and finally the runtime string values.
+// If an arg is a string the data value is the runtime length of the string.
+// Each 4-byte key contains the llvm type ID and the number of bits for the
+// type. encoded by the macro _PACK_TY_BITLEN(x,y) ((uint32_t)x << 16) |
+// ((uint32_t)y)
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../clang/lib/Headers/EmissaryIds.h"
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+// These static helper functions support EmitEmissaryExec.
+static llvm::Function *GetOmpStrlenDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  // Args are pointer to char and maxstringlen
+  llvm::Type *ArgTypes[] = {CGM.Int8PtrTy, CGM.Int32Ty};
+  llvm::FunctionType *OmpStrlenFTy =
+      llvm::FunctionType::get(CGM.Int32Ty, ArgTypes, false);
+  if (auto *F = M.getFunction("__strlen_max")) {
+    assert(F->getFunctionType() == OmpStrlenFTy);
+    return F;
+  }
+  llvm::Function *FN = llvm::Function::Create(
+      OmpStrlenFTy, llvm::GlobalVariable::ExternalLinkage, "__strlen_max", &M);
+  return FN;
+}
+
+// Deterimines if an expression is a string with variable lenth
+static bool isVarString(const clang::Expr *argX, const clang::Type *argXTy,
+                        const llvm::Value *Arg) {
+  if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+      argXTy->getPointeeOrArrayElementType()->isCharType() && !argX->isLValue())
+    return true;
+  // Ensure the VarDecl has an inititalizer
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(argX))
+    if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+      if (!VD->getInit() ||
+          !llvm::isa<StringLiteral>(VD->getInit()->IgnoreImplicit()))
+        return true;
+  return false;
+}
+
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+  if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+      argXTy->getPointeeOrArrayElementType()->isCharType())
+    return true;
+  else
+    return false;
+}
+
+// Gets a string literal to write into the transfer buffer
+static const StringLiteral *getSL(const clang::Expr *argX,
+                                  const clang::Type *argXTy) {
+  // String in argX has known constant length
+  if (!argXTy->isConstantArrayType()) {
+    // Allow constant string to be a declared variable,
+    // But it must be constant and initialized.
+    const DeclRefExpr *DRE = cast<DeclRefExpr>(argX);
+    const VarDecl *VarD = cast<VarDecl>(DRE->getDecl());
+    argX = VarD->getInit()->IgnoreImplicit();
+  }
+  const StringLiteral *SL = cast<StringLiteral>(argX);
+  return SL;
+}
+
+// Returns a function pointer to __llvm_emissary_premalloc
+static llvm::Function *GetEmissaryAllocDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  // clang::CodeGen::CodeGenTypes &CGT = CGM.getTypes();
+  const char *_executeName = "__llvm_emissary_premalloc";
+  llvm::Type *ArgTypes[] = {CGM.Int32Ty};
+  llvm::Function *FN;
+  // Maybe this should be pointer to char instead of pointer to void
+  llvm::FunctionType *VargsFnAllocFuncType = llvm::FunctionType::get(
+      CGM.getTypes().ConvertType(
+          CGM.getContext().getPointerType(CGM.getContext().VoidTy)),
+      ArgTypes, false);
+  if (!(FN = M.getFunction(_executeName)))
+    FN = llvm::Function::Create(VargsFnAllocFuncType,
+                                llvm::GlobalVariable::ExternalLinkage,
+                                _executeName, &M);
+  assert(FN->getFunctionType() == VargsFnAllocFuncType);
+  return FN;
+}
+
+// Returns a function pointer to __llvm_emissary_rpc
+static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
+                                                  bool hasXfers) {
+  const char *_executeName =
+      hasXfers ? "__llvm_emissary_rpc_dm" : "__llvm_emissary_rpc";
+  auto &M = CGM.getModule();
+  llvm::Type *ArgTypes[] = {
+      CGM.Int32Ty, CGM.getTypes().ConvertType(CGM.getContext().getPointerType(
+                       CGM.getContext().VoidTy))};
+  llvm::Function *FN;
+  llvm::FunctionType *VarfnFuncType =
+      llvm::FunctionType::get(CGM.Int64Ty, ArgTypes, false);
+  if (!(FN = M.getFunction(_executeName)))
+    FN = llvm::Function::Create(
+        VarfnFuncType, llvm::GlobalVariable::ExternalLinkage, _executeName, &M);
+  assert(FN->getFunctionType() == VarfnFuncType);
+  return FN;
+}
+
+// A macro to pack the llvm type ID and numbits into 4-byte key
+#define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
+
+//  ----- External function EmitEmissaryExec called from CGExpr.cpp -----
+RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
+  assert(getTarget().getTriple().isAMDGCN() ||
+         getTarget().getTriple().isNVPTX());
+  assert(E->getNumArgs() >= 1); // _emissary_exec always has at least one arg.
+  const llvm::DataLayout &DL = CGM.getDataLayout();
+  CallArgList Args;
+
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // We don't know how to emit non-scalar varargs.
+  if (std::any_of(Args.begin() + 1, Args.end(), [&](const CallArg &A) {
+        return !A.getRValue(*this).isScalar();
+      })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg in GPU vargs function");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+  }
+  // NumArgs always includes emisid, but E->getNumArgs() could be 1 less if
+  // inserted it above.
+  unsigned NumArgs = (unsigned)Args.size();
+  llvm::SmallVector<llvm::Type *, 32> ArgTypes;
+  llvm::SmallVector<llvm::Value *, 32> VarStrLengths;
+  llvm::Value *TotalVarStrsLength = llvm::ConstantInt::get(Int32Ty, 0);
+  bool hasVarStrings = false;
+  ArgTypes.push_back(Int32Ty); // 1st field in struct is total DataLen
+  ArgTypes.push_back(Int32Ty); // 2nd field in struct will be num args
+  // An array of 4-byte keys that describe the arg type
+  for (unsigned I = 0; I < NumArgs; ++I)
+    ArgTypes.push_back(Int32Ty);
+
+  // Track the size of the numeric data length and string length
+  unsigned DataLen_CT =
+      (unsigned)(DL.getTypeAllocSize(Int32Ty)) * (NumArgs + 2);
+  unsigned AllStringsLen_CT = 0;
+
+  // ---  1st Pass over Args to create ArgTypes and count size ---
+  size_t structOffset = 4 * (NumArgs + 2);
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
+    llvm::Type *ArgType = Arg->getType();
+    // Skip string processing on arg0 which may not be in E->getArg(0)
+    if (I != 0) {
+      const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+      auto *argXTy = argX->getType().getTypePtr();
+      if (isString(argXTy)) {
+        if (isVarString(argX, argXTy, Arg)) {
+          hasVarStrings = true;
+          if (auto *PtrTy = dyn_cast<llvm::PointerType>(ArgType))
+            if (PtrTy->getPointerAddressSpace()) {
+              Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
+              ArgType = Arg->getType();
+            }
+          llvm::Value *VarStrLen =
+              Builder.CreateCall(GetOmpStrlenDeclaration(CGM),
+                                 {Arg, llvm::ConstantInt::get(Int32Ty, 1024)});
+          VarStrLengths.push_back(VarStrLen);
+          TotalVarStrsLength = Builder.CreateAdd(TotalVarStrsLength, VarStrLen,
+                                                 "sum_of_var_strings_length");
+          ArgType = Int32Ty;
+        } else {
+          const StringLiteral *SL = getSL(argX, argXTy);
+          StringRef ArgString = SL->getString();
+          AllStringsLen_CT += ((int)ArgString.size() + 1);
+          // change ArgType from char ptr to int to contain string length
+          ArgType = Int32Ty;
+        }
+      } // end of processing string argument
+    } // End of skip 1st arg
+    // if ArgTypeSize is >4 bytes we need to insert dummy align
+    // values in the struct so all stores can be aligned .
+    // These dummy fields must be inserted before the arg.
+    //
+    // In the pass below where the stores are generated careful
+    // tracking of the index into the struct is necessary.
+    size_t needsPadding = (structOffset % (size_t)DL.getTypeAllocSize(ArgType));
+    if (needsPadding) {
+      DataLen_CT += (unsigned)needsPadding;
+      structOffset += needsPadding;
+      ArgTypes.push_back(Int32Ty); // could assert that needsPadding == 4 here
+    }
+
+    ArgTypes.push_back(ArgType);
+    DataLen_CT += ((int)DL.getTypeAllocSize(ArgType));
+    structOffset += (size_t)DL.getTypeAllocSize(ArgType);
+  }
+
+  // ---  Generate call to __llvm_emissary_premalloc to get data pointer
+  if (hasVarStrings)
+    TotalVarStrsLength = Builder.CreateAdd(
+        TotalVarStrsLength,
+        llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT),
+        "total_buffer_size");
+  llvm::Value *BufferLen =
+      hasVarStrings
+          ? TotalVarStrsLength
+          : llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT);
+  llvm::Value *DataStructPtr =
+      Builder.CreateCall(GetEmissaryAllocDeclaration(CGM), {BufferLen});
+
+  // --- Cast the generic return pointer to be a struct in device global memory
+  llvm::StructType *DataStructTy =
+      llvm::StructType::create(ArgTypes, "varfn_args_store");
+  unsigned AS = getContext().getTargetAddressSpace(LangAS::cuda_device);
+  llvm::Value *BufferPtr = Builder.CreatePointerCast(
+      DataStructPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+      "varfn_args_store_casted");
+  // ---  Header of struct contains length and NumArgs ---
+  llvm::Value *DataLenField = llvm::ConstantInt::get(Int32Ty, DataLen_CT);
+  llvm::Value *P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 0);
+  Builder.CreateAlignedStore(DataLenField, P,
+                             DL.getPrefTypeAlign(DataLenField->getType()));
+  llvm::Value *NumArgsField = llvm::ConstantInt::get(Int32Ty, NumArgs);
+  P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 1);
+  Builder.CreateAlignedStore(NumArgsField, P,
+                             DL.getPrefTypeAlign(NumArgsField->getType()));
+
+  // ---  2nd Pass: create array of 4-byte keys to describe each arg
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
+    llvm::Type::TypeID argtypeid =
+        Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+
+    // Get type size in bits. Usually 64 or 32.
+    uint32_t numbits = 0;
+    if (I > 0 &&
+        isString(E->getArg(I)->IgnoreParenCasts()->getType().getTypePtr()))
+      // The llvm typeID for string is pointer.  Since pointer numbits is 0,
+      // we set numbits to 1 to distinguish pointer type ID as string pointer.
+      numbits = 1;
+    else
+      numbits = ty->getScalarSizeInBits();
+    // Create a key that combines llvm typeID and size
+    llvm::Value *Key =
+        llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+    P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
+    Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
+  }
+
+  // ---  3rd Pass: Store data values for each arg ---
+  unsigned varstring_index = 0;
+  unsigned structIndex = 2 + NumArgs;
+  structOffset = 4 * structIndex;
+  bool hasXfers;
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Value *Arg;
+    if (I == 0) {
+      Arg = Args[I].getKnownRValue().getScalarVal();
+      llvm::ConstantInt *CI = llvm::dyn_cast<llvm::ConstantInt>(Arg);
+      uint64_t uint64value = CI->getZExtValue();
+      uint32_t lower_32 = (uint32_t)(uint64value & 0xFFFFFFFF);
+      hasXfers = lower_32 ? true : false;
+    } else {
+      const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+      auto *argXTy = argX->getType().getTypePtr();
+      if (isString(argXTy)) {
+        if (isVarString(argX, argXTy, Arg)) {
+          Arg = VarStrLengths[varstring_index];
+          varstring_index++;
+        } else {
+          const StringLiteral *SL = getSL(argX, argXTy);
+          StringRef ArgString = SL->getString();
+          int ArgStrLen = (int)ArgString.size() + 1;
+          // Change Arg from a char pointer to the integer string length
+          Arg = llvm::ConstantInt::get(Int32Ty, ArgStrLen);
+        }
+      } else {
+        Arg = Args[I].getKnownRValue().getScalarVal();
+      }
+    }
+    size_t structElementSize = (size_t)DL.getTypeAllocSize(Arg->getType());
+    size_t needsPadding = (structOffset % structElementSize);
+    if (needsPadding) {
+      // Skip over dummy fields in struct to align
+      structOffset += needsPadding; // should assert needsPadding == 4
+      structIndex++;
+    }
+    P = Builder.CreateStructGEP(DataStructTy, BufferPtr, structIndex);
+    Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
+    structOffset += structElementSize;
+    structIndex++;
+  }
+
+  // ---  4th Pass: memcpy all strings after the data values ---
+  // bitcast the struct in device global memory as a char buffer
+  Address BufferPtrByteAddr =
+      Address(Builder.CreatePointerCast(
+                  BufferPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+                  "_casted"),
+              Int8Ty, CharUnits::fromQuantity(1));
+
+  // BufferPtrByteAddr is a pointer to where we want to write the next string
+  BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+      BufferPtrByteAddr, CharUnits::fromQuantity(DataLen_CT));
+  varstring_index = 0;
+  // Skip string processing on arg0 which may not be in E->getArg(0)
+  for (unsigned I = 1; I < NumArgs; ++I) {
+    llvm::Value *Arg = Args[I].getKnownRValue().getScalarVal();
+    const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+    auto *argXTy = argX->getType().getTypePtr();
+    if (isString(argXTy)) {
+      if (isVarString(argX, argXTy, Arg)) {
+        llvm::Value *varStrLength = VarStrLengths[varstring_index];
+        varstring_index++;
+        Address SrcAddr = Address(Arg, Int8Ty, CharUnits::fromQuantity(1));
+        Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, varStrLength);
+        // update BufferPtrByteAddr for next string memcpy
+        llvm::Value *PtrAsInt = BufferPtrByteAddr.emitRawPointer(*this);
+        BufferPtrByteAddr =
+            Address(Builder.CreateGEP(Int8Ty, PtrAsInt,
+                                      ArrayRef<llvm::Value *>(varStrLength)),
+                    Int8Ty, CharUnits::fromQuantity(1));
+      } else {
+        const StringLiteral *SL = getSL(argX, argXTy);
+        StringRef ArgString = SL->getString();
+        int ArgStrLen = (int)ArgString.size() + 1;
+        Address SrcAddr = CGM.GetAddrOfConstantStringFromLiteral(SL);
+        Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, ArgStrLen);
+        // update BufferPtrByteAddr for next memcpy
+        BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+            BufferPtrByteAddr, CharUnits::fromQuantity(ArgStrLen));
+      }
+    }
+  }
+  // --- Generate call to __llvm_emissary_rpc and return RValue
+  llvm::Value *emis_rc = Builder.CreateCall(
+      GetEmissaryExecDeclaration(CGM, hasXfers), {BufferLen, DataStructPtr});
+  // truncate long long int to int for printf return value.
+  if ((E->getDirectCallee()->getNameAsString() == "fprintf") ||
+      (E->getDirectCallee()->getNameAsString() == "printf"))
+    emis_rc = Builder.CreateTrunc(emis_rc, CGM.Int32Ty, "emis_rc");
+  return RValue::get(emis_rc);
+}
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 6309c37788f0c..a4ee21d968b5e 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6829,6 +6829,17 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
       StaticOperator = true;
   }
 
+  // Replace calls to _emissary_exec found in emissary device stubs with calls
+  // to either __llvm_emissary_rpc or __llvm_emissary_rpc_dm. Before the call
+  // EmitEmissaryExec generates code to allocate an arg buffer and to fill the
+  // arg buffer.
+  if ((CGM.getTriple().isAMDGCN() || CGM.getTriple().isNVPTX()) && FnType &&
+      dyn_cast<FunctionProtoType>(FnType) &&
+      dyn_cast<FunctionProtoType>(FnType)->isVariadic() &&
+      (E->getDirectCallee()->getNameAsString() == "_emissary_exec") &&
+      CGM.getLangOpts().OpenMP)
+    return EmitEmissaryExec(E);
+
   auto Arguments = E->arguments();
   if (StaticOperator) {
     // If we're calling a static operator, we need to emit the object argument
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index dbbc35b372f42..f699cee7fea11 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangCodeGen
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
+  CGEmitEmissaryExec.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 855e43631f436..3e962e5d0e34c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4708,6 +4708,7 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
   RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
+  RValue EmitEmissaryExec(const CallExpr *E);
 
   RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                          const CallExpr *E, ReturnValueSlot ReturnValue);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4399eb475be75..b3be55c9d7d3d 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7813,6 +7813,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
+  // To opt-out of emissary printf/fprint set -fno-use-emissary-print. This
+  // will use the slower device libc for printf/fprintf. In the default mode
+  // (use-emissary-print), all other device libc functions are still active.
+  if (Args.hasFlag(options::OPT_fuse_emissary_print,
+                   options::OPT_fno_use_emissary_print, true))
+    CmdArgs.push_back("-DOFFLOAD_ENABLE_EMISSARY_PRINT");
+
   if (Triple.isAMDGPU()) {
     handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
 
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c92b370b88d2d..c1edefa5f6aaa 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -41,6 +41,8 @@ set(core_files
   tgmath.h
   unwind.h
   varargs.h
+  EmissaryIds.h
+  EmissaryMPI.h
   )
 
 set(arm_common_files
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
new file mode 100644
index 0000000000000..f9ff616a0aacb
--- /dev/null
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -0,0 +1,97 @@
+//===- openmp/device/include/EmissaryIds.h enum & headers ----- C++ -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines Emissary API identifiers. This header is used by both host
+// and device compilations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_IDS_H
+#define OFFLOAD_EMISSARY_IDS_H
+/// The sets of emissary APIs under development
+typedef enum {
+  EMIS_ID_INVALID,
+  EMIS_ID_FORTRT,
+  EMIS_ID_PRINT,
+  EMIS_ID_MPI,
+  EMIS_ID_HDF5,
+  EMIS_ID_RESERVE,
+} offload_emis_id_t;
+
+typedef enum {
+  _print_INVALID,
+  _printf_idx,
+  _fprintf_idx,
+  _ockl_asan_report_idx,
+} offload_emis_print_t;
+
+/// The vargs function used by emissary API device stubs
+unsigned long long _emissary_exec(unsigned long long, ...);
+
+//#define _PACK_EMIS_IDS(x, y)                                                   \
+//  ((unsigned long long)x << 32) | ((unsigned long long)y)
+
+#define _PACK_EMIS_IDS(a, b, c, d)                                             \
+  ((unsigned long long)a << 48) | ((unsigned long long)b << 32) |              \
+      ((unsigned long long)c << 16) | ((unsigned long long)d)
+
+typedef enum {
+  _FortranAio_INVALID,
+  _FortranAioBeginExternalListOutput_idx,
+  _FortranAioOutputAscii_idx,
+  _FortranAioOutputInteger32_idx,
+  _FortranAioEndIoStatement_idx,
+  _FortranAioOutputInteger8_idx,
+  _FortranAioOutputInteger16_idx,
+  _FortranAioOutputInteger64_idx,
+  _FortranAioOutputReal32_idx,
+  _FortranAioOutputReal64_idx,
+  _FortranAioOutputComplex32_idx,
+  _FortranAioOutputComplex64_idx,
+  _FortranAioOutputLogical_idx,
+  _FortranAAbort_idx,
+  _FortranAStopStatementText_idx,
+  _FortranAioBeginExternalFormattedOutput_idx,
+  _FortranAStopStatement_idx,
+} offload_emis_fortrt_idx;
+
+/// This structure is created by emisExtractArgBuf to get information
+/// from the data buffer passed by rpc.
+typedef struct {
+  unsigned int DataLen;
+  unsigned int NumArgs;
+  unsigned int emisid;
+  unsigned int emisfnid;
+  unsigned int NumSendXfers;
+  unsigned int NumRecvXfers;
+  unsigned long long data_not_used;
+  char *keyptr;
+  char *argptr;
+  char *strptr;
+} emisArgBuf_t;
+
+typedef unsigned long long EmissaryReturn_t;
+typedef unsigned long long emis_argptr_t;
+typedef EmissaryReturn_t emisfn_t(void *, ...);
+
+#define MAXVARGS 32
+
+typedef enum service_rc {
+  _ERC_SUCCESS = 0,
+  _ERC_STATUS_ERROR = 1,
+  _ERC_DATA_USED_ERROR = 2,
+  _ERC_ADDINT_ERROR = 3,
+  _ERC_ADDFLOAT_ERROR = 4,
+  _ERC_ADDSTRING_ERROR = 5,
+  _ERC_UNSUPPORTED_ID_ERROR = 6,
+  _ERC_INVALID_ID_ERROR = 7,
+  _ERC_ERROR_INVALID_REQUEST = 8,
+  _ERC_EXCEED_MAXVARGS_ERROR = 9,
+} service_rc;
+
+#endif // OFFLOAD_EMISSARY_IDS_H
diff --git a/clang/lib/Headers/EmissaryMPI.h b/clang/lib/Headers/EmissaryMPI.h
new file mode 100644
index 0000000000000..c9529e1ea73ce
--- /dev/null
+++ b/clang/lib/Headers/EmissaryMPI.h
@@ -0,0 +1,202 @@
+//===----------------  openmp/device/include/EmissaryMPI.h  ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmissaryMPI.h This include must be included by MPI application
+//
+//===----------------------------------------------------------------------===//
+#include "EmissaryIds.h"
+#include <stdarg.h>
+#include <unordered_map>
+
+typedef enum {
+  _MPI_INVALID,
+  _MPI_Send_idx,
+  _MPI_Recv_idx,
+  _MPI_Allreduce_idx,
+  _MPI_Reduce_idx,
+} offload_emis_mpi_t;
+
+// -------- DELETE THIS BLOCK WHEN MPI_Type_size on GPU WORKS ----------------
+// Emissary_Initialize_MPI builds a table of lengths for each each MPI_Datatype.
+// That table is called _mpi_type_lens and passed to the device.
+// We need this because we do not yet have a GPU version of MPI_Type_size.
+// If we did we can avoid the table search for datatype_size. This is
+// how MPI datatype length should be calculated.
+//   int datatype_size;
+//   MPI_Type_size(v2,&datatype_size) ;
+// Delete this table search when we have a working MPI_Type_size.
+#define _MPI_DATATYPES 5
+typedef struct mpi_type_len_t {
+  uint64_t dt_signature;
+  uint32_t dt_size;
+} mpi_type_len_t;
+#pragma omp begin declare target
+mpi_type_len_t _mpi_type_lens[_MPI_DATATYPES];
+#pragma omp end declare target
+void Emissary_Initialize_MPI() {
+  MPI_Datatype _mpi_int = MPI_INT;
+  MPI_Datatype _mpi_float = MPI_FLOAT;
+  MPI_Datatype _mpi_unsigned = MPI_UNSIGNED;
+  MPI_Datatype _mpi_double = MPI_DOUBLE;
+  MPI_Datatype _mpi_char = MPI_CHAR;
+  _mpi_type_lens[0] = {(uint64_t)_mpi_int, 4};
+  _mpi_type_lens[1] = {(uint64_t)_mpi_unsigned, 4};
+  _mpi_type_lens[2] = {(uint64_t)_mpi_float, 4};
+  _mpi_type_lens[3] = {(uint64_t)_mpi_double, 8};
+  _mpi_type_lens[4] = {(uint64_t)_mpi_char, 1};
+#pragma omp target update to(_mpi_type_lens[0 : _MPI_DATATYPES])
+}
+// -------- END BLOCK TO DELETE WHEN MPI_Type_size on GPU WORKS ---------------
+
+///  Device stubs must use the identical host API interface.
+///  Stubs call _emissary_exec with additional args that include
+///  the identifier and additional D2H and H2D transfer vectors.
+///  whose params include an identifier
+///
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+
+extern "C" int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
+                        int dest, int tag, MPI_Comm comm) {
+  uint64_t dt_signature = (uint64_t)datatype;
+  int datatype_size = 8; // Default in case we do not have it in our table.
+  for (int i = 0; i < _MPI_DATATYPES; i++)
+    if (_mpi_type_lens[i].dt_signature == dt_signature) {
+      datatype_size = _mpi_type_lens[i].dt_size;
+      break;
+    }
+  return (int)_emissary_exec(
+      // The emissary identifier is a static 64 bit field that encodes
+      // the emissary id, emissary function, D2H Xfer cnt, and H2D Xfer cnt.
+      _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0),
+      // Each D2H transfer vector needs a pair of args to describe the xfer
+      // The first is the device pointer, the 2nd is size.
+      buf, (int)count * datatype_size,
+      // These are the actual 6 MPI_Send Args passed directly from the params
+      buf, count, datatype, dest, tag, comm);
+}
+extern "C" int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
+                        int tag, MPI_Comm comm, MPI_Status *st) {
+  uint64_t dt_signature = (uint64_t)datatype;
+  int datatype_size = 8;
+  for (int i = 0; i < _MPI_DATATYPES; i++)
+    if (_mpi_type_lens[i].dt_signature == dt_signature) {
+      datatype_size = _mpi_type_lens[i].dt_size;
+      break;
+    }
+  return (int)_emissary_exec(_PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1),
+                             buf,
+                             (int)count * datatype_size, // This is a H2D Xfer
+                             buf, count, datatype, source, tag, comm, st);
+}
+extern "C" int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) {
+  return (int)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Allreduce_idx, 1, 1), sendbuf, recvbuf,
+      count, datatype, op, comm);
+}
+extern "C" int MPI_Reduce(const void *sendbuf, void *recvbuf, int count,
+                          MPI_Datatype datatype, MPI_Op op, int root,
+                          MPI_Comm comm) {
+  return (int)_emissary_exec(_PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Reduce_idx, 1, 1),
+                             sendbuf, recvbuf, count, datatype, op, root, comm);
+}
+
+#else
+
+/// Host variadic wrapper functions.
+extern "C" {
+extern int V_MPI_Send(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  int v1 = va_arg(args, int);
+  MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+  int v3 = va_arg(args, int);
+  int v4 = va_arg(args, int);
+  MPI_Comm v5 = va_arg(args, MPI_Comm);
+  va_end(args);
+  int rval = MPI_Send(v0, v1, v2, v3, v4, v5);
+  return rval;
+}
+extern int V_MPI_Recv(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  int v1 = va_arg(args, int);
+  MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+  int v3 = va_arg(args, int);
+  int v4 = va_arg(args, int);
+  MPI_Comm v5 = va_arg(args, MPI_Comm);
+  MPI_Status *v6 = va_arg(args, MPI_Status *);
+  va_end(args);
+  int rval = MPI_Recv(v0, v1, v2, v3, v4, v5, v6);
+  return rval;
+}
+extern int V_MPI_Allreduce(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *buf = va_arg(args, void *);
+  void *recvbuf = va_arg(args, void *);
+  int count = va_arg(args, int);
+  MPI_Datatype datatype = va_arg(args, MPI_Datatype);
+  MPI_Op op = va_arg(args, MPI_Op);
+  MPI_Comm comm = va_arg(args, MPI_Comm);
+  va_end(args);
+  int rval = MPI_Allreduce(buf, recvbuf, count, datatype, op, comm);
+  return rval;
+}
+extern int V_MPI_Reduce(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *buf = va_arg(args, void *);
+  void *recvbuf = va_arg(args, void *);
+  int count = va_arg(args, int);
+  MPI_Datatype datatype = va_arg(args, MPI_Datatype);
+  MPI_Op op = va_arg(args, MPI_Op);
+  int root = va_arg(args, int);
+  MPI_Comm comm = va_arg(args, MPI_Comm);
+  va_end(args);
+  int rval = MPI_Reduce(buf, recvbuf, count, datatype, op, root, comm);
+  return rval;
+}
+
+/// EmissaryMPI function selector
+EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab, emis_argptr_t *a[]) {
+
+  switch (ab->emisfnid) {
+  case _MPI_Send_idx: {
+    void *fnptr = (void *)V_MPI_Send;
+    int return_value_int =
+        V_MPI_Send(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  case _MPI_Recv_idx: {
+    void *fnptr = (void *)V_MPI_Recv;
+    int return_value_int =
+        V_MPI_Recv(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  case _MPI_Allreduce_idx: {
+    void *fnptr = (void *)V_MPI_Allreduce;
+    int return_value_int =
+        V_MPI_Allreduce(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  case _MPI_Reduce_idx: {
+    void *fnptr = (void *)V_MPI_Reduce;
+    int return_value_int =
+        V_MPI_Reduce(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    return (EmissaryReturn_t)return_value_int;
+  }
+  }
+  return (EmissaryReturn_t)0;
+}
+
+} // end extern "C"
+
+#endif
diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdio.h b/clang/lib/Headers/llvm_libc_wrappers/stdio.h
index 0c3e44823da70..e8c0250cfb01b 100644
--- a/clang/lib/Headers/llvm_libc_wrappers/stdio.h
+++ b/clang/lib/Headers/llvm_libc_wrappers/stdio.h
@@ -21,6 +21,23 @@
 #define __LIBC_ATTRS
 #endif
 
+// To turn off emissary print (and this macro) set -fno-use-emissary-print.
+#ifdef OFFLOAD_ENABLE_EMISSARY_PRINT
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+#include <EmissaryIds.h>
+#define fprintf(...)                                                           \
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _fprintf_idx, 0, 0),            \
+                 __VA_ARGS__);
+#define printf(...)                                                            \
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx, 0, 0), __VA_ARGS__);
+#define fputc(c, stream) fprintf(stream, "%c", (unsigned char)(c))
+#define putc(c, stream) fprintf(stream, "%c", (unsigned char)(c))
+#define putchar(c) printf("%c", (char)(c))
+#define fputs(str, stream) fprintf((stream), "%s", (str))
+#define puts(str) fprintf(stdout, "%s", (str))
+#endif
+#endif
+
 // Some headers provide these as macros. Temporarily undefine them so they do
 // not conflict with any definitions for the GPU.
 
diff --git a/offload/include/Shared/RPCOpcodes.h b/offload/include/Shared/RPCOpcodes.h
index beee29df1f707..fbdc839620cec 100644
--- a/offload/include/Shared/RPCOpcodes.h
+++ b/offload/include/Shared/RPCOpcodes.h
@@ -18,6 +18,8 @@
 
 typedef enum {
   OFFLOAD_HOST_CALL = LLVM_OFFLOAD_OPCODE(0),
+  OFFLOAD_EMISSARY = LLVM_OFFLOAD_OPCODE(1),
+  OFFLOAD_EMISSARY_DM = LLVM_OFFLOAD_OPCODE(2),
 } offload_opcode_t;
 
 #undef LLVM_OFFLOAD_OPCODE
diff --git a/offload/libomptarget/CMakeLists.txt b/offload/libomptarget/CMakeLists.txt
index 93e684e53bf17..d3247c09a50a8 100644
--- a/offload/libomptarget/CMakeLists.txt
+++ b/offload/libomptarget/CMakeLists.txt
@@ -57,6 +57,7 @@ target_compile_definitions(omptarget PRIVATE
 foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
   target_link_libraries(omptarget PRIVATE omptarget.rtl.${plugin})
 endforeach()
+target_link_libraries(omptarget PRIVATE flang_rt.runtime)
 
 target_compile_options(omptarget PRIVATE ${offload_compile_flags})
 target_link_options(omptarget PRIVATE ${offload_link_flags})
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index ea0910abf95d5..0232fe2d83f28 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -7,6 +7,15 @@ tablegen(OFFLOAD include/OffloadErrcodes.inc -gen-errcodes -I ${CMAKE_CURRENT_SO
 tablegen(OFFLOAD include/OffloadInfo.inc -gen-info -I ${CMAKE_CURRENT_SOURCE_DIR}/../../liboffload/API)
 add_public_tablegen_target(PluginErrcodes)
 
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+  set(emissary_sources
+    src/Emissary.cpp
+    src/EmissaryFortrt.cpp
+    src/EmissaryPrint.cpp
+  )
+  set(emissary_headers ${CMAKE_CURRENT_SOURCE_DIR}/../../../openmp/device/include)
+endif()
+
 # NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
 # don't want to export `PluginInterface` while `add_llvm_library` requires that.
 add_library(PluginCommon OBJECT
@@ -16,6 +25,7 @@ add_library(PluginCommon OBJECT
   src/RPC.cpp
   src/OffloadError.cpp
   src/Utils/ELF.cpp
+  ${emissary_sources}
 )
 add_dependencies(PluginCommon intrinsics_gen PluginErrcodes)
 
@@ -34,6 +44,10 @@ endif()
 # Include the RPC server from the `libc` project.
 include(FindLibcCommonUtils)
 target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+  target_link_libraries(PluginCommon PRIVATE flang_rt.runtime
+    -L${CMAKE_BINARY_DIR}/../../lib -L${CMAKE_INSTALL_PREFIX}/lib)
+endif()
 
 # Define the TARGET_NAME and DEBUG_PREFIX.
 target_compile_definitions(PluginCommon PRIVATE
@@ -50,6 +64,7 @@ target_include_directories(PluginCommon PUBLIC
   ${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
   ${LIBOMPTARGET_BINARY_INCLUDE_DIR}
   ${LIBOMPTARGET_INCLUDE_DIR}
+  ${emissary_headers}
 )
 
 set_target_properties(PluginCommon PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/offload/plugins-nextgen/common/include/Emissary.h b/offload/plugins-nextgen/common/include/Emissary.h
new file mode 100644
index 0000000000000..317c6ea51136b
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/Emissary.h
@@ -0,0 +1,263 @@
+//===-- offload/plugins-nextgen/common/include/Emissary.h ------ C++ ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines emissary helper functions. This include is only used for host
+// compilation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_H
+#define OFFLOAD_EMISSARY_H
+
+#include "EmissaryIds.h"
+#include "RPC.h"
+// #include "shared/rpc.h"
+#include "shared/rpc_server.h"
+
+extern "C" {
+
+/// Called by rpc after receiving emissary argument buffer
+EmissaryReturn_t Emissary(char *data, emisArgBuf_t *ab,
+                          std::unordered_map<void *, void *> *D2HAddrList);
+
+/// Called by Emissary for all Fortrt emissary functions
+EmissaryReturn_t
+EmissaryFortrt(char *data, emisArgBuf_t *ab,
+               std::unordered_map<void *, void *> *D2HAddrList);
+
+/// Called by Emissary for all misc print functions
+EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab);
+
+/// Called by Emissary for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+                                                 emis_argptr_t *arg[]);
+
+/// Called by Emissary for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+                                                  emis_argptr_t *arg[]);
+
+/// Called by Emissary to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+                                                     emisArgBuf_t *ab,
+                                                     emis_argptr_t *arg[]);
+
+/// emisExtractArgBuf is called within the "case OFFLOAD_EMISSARY:" stanza
+/// in offload/plugins-nextgen/common/src/RPC.cpp to build the emisArgBuf_t
+/// structure from the emissary data buffer sent to the CPU by rpc.
+/// This buffer is created by clang CodeGen when variadic function
+/// _emissary_exec(...) is encountered when compiling any emissary device
+/// stub to define the device function.
+void emisExtractArgBuf(char *buf, emisArgBuf_t *ab);
+
+/// Move the ArgBuf tracker past a set of XferSrgs
+void emisSkipXferArgSet(emisArgBuf_t *ab);
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+uint64_t getuint32(char *val);
+/// Get uint64_t value from a char ptr
+uint64_t getuint64(char *val);
+/// Get a function pointer from a char ptr
+void *getfnptr(char *val);
+
+/// Builds the array of pointers passed to V_ functions
+uint32_t EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr,
+                            char *strptr, unsigned long long *data_not_used,
+                            emis_argptr_t *a[],
+                            std::unordered_map<void *, void *> *D2HAddrList);
+
+} // end extern "C"
+
+/// Call the associated V_ function
+template <typename T, typename FT>
+extern T EmissaryCallFnptr(uint32_t NumArgs, void *fnptr, emis_argptr_t *a[]);
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+enum TypeID {
+  // PrimitiveTypes
+  HalfTyID = 0,  ///< 16-bit floating point type
+  BFloatTyID,    ///< 16-bit floating point type (7-bit significand)
+  FloatTyID,     ///< 32-bit floating point type
+  DoubleTyID,    ///< 64-bit floating point type
+  X86_FP80TyID,  ///< 80-bit floating point type (X87)
+  FP128TyID,     ///< 128-bit floating point type (112-bit significand)
+  PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+  VoidTyID,      ///< type with no size
+  LabelTyID,     ///< Labels
+  MetadataTyID,  ///< Metadata
+  X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
+  TokenTyID,     ///< Tokens
+
+  // Derived types... see DerivedTypes.h file.
+  IntegerTyID,        ///< Arbitrary bit width integers
+  FunctionTyID,       ///< Functions
+  PointerTyID,        ///< Pointers
+  StructTyID,         ///< Structures
+  ArrayTyID,          ///< Arrays
+  FixedVectorTyID,    ///< Fixed width SIMD vector type
+  ScalableVectorTyID, ///< Scalable SIMD vector type
+  TypedPointerTyID,   ///< Typed pointer used by some GPU targets
+  TargetExtTyID,      ///< Target extension type
+};
+
+template <typename T, typename FT>
+extern T EmissaryCallFnptr(uint32_t NumArgs, void *fnptr, emis_argptr_t *a[]) {
+  T rv;
+  FT *vfnptr = (FT *)fnptr;
+  switch (NumArgs) {
+  case 1:
+    rv = (T)vfnptr(fnptr, a[0]);
+    break;
+  case 2:
+    rv = (T)vfnptr(fnptr, a[0], a[1]);
+    break;
+  case 3:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2]);
+    break;
+  case 4:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3]);
+    break;
+  case 5:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4]);
+    break;
+  case 6:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+    break;
+  case 7:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+    break;
+  case 8:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+    break;
+  case 9:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+    break;
+  case 10:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9]);
+    break;
+  case 11:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10]);
+    break;
+  case 12:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11]);
+    break;
+  case 13:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12]);
+    break;
+  case 14:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13]);
+    break;
+  case 15:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14]);
+    break;
+  case 16:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
+    break;
+  case 17:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16]);
+    break;
+  case 18:
+    rv =
+        (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                  a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17]);
+    break;
+  case 19:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18]);
+    break;
+  case 20:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19]);
+    break;
+  case 21:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20]);
+    break;
+  case 22:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21]);
+    break;
+  case 23:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22]);
+    break;
+  case 24:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23]);
+    break;
+  case 25:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24]);
+    break;
+  case 26:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25]);
+    break;
+  case 27:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26]);
+    break;
+  case 28:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27]);
+    break;
+  case 29:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28]);
+    break;
+  case 30:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28], a[29]);
+    break;
+  case 31:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28], a[29], a[30]);
+    break;
+  case 32:
+    rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+                   a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+                   a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+                   a[26], a[27], a[28], a[29], a[30], a[31]);
+    break;
+  default:
+    rv = 0;
+  }
+  return rv;
+}
+
+#endif // OFFLOAD_EMISSARY_H
diff --git a/offload/plugins-nextgen/common/src/Emissary.cpp b/offload/plugins-nextgen/common/src/Emissary.cpp
new file mode 100644
index 0000000000000..0be0e426ee050
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/Emissary.cpp
@@ -0,0 +1,259 @@
+//===----- ioffload/plugins-nexgen/common/include/Emissary.cpp ---- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RPC.h"
+
+#include "Shared/Debug.h"
+#include "Shared/RPCOpcodes.h"
+
+#include "PluginInterface.h"
+
+#include "Emissary.h"
+#include "EmissaryIds.h"
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include <unordered_map>
+
+extern "C" EmissaryReturn_t
+Emissary(char *data, emisArgBuf_t *ab,
+         std::unordered_map<void *, void *> *D2HAddrList) {
+  EmissaryReturn_t result = 0;
+  emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+      sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+  switch (ab->emisid) {
+  case EMIS_ID_INVALID: {
+    fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+    result = 0;
+    break;
+  }
+  case EMIS_ID_FORTRT: {
+    result = EmissaryFortrt(data, ab, D2HAddrList);
+    break;
+  }
+  case EMIS_ID_PRINT: {
+    result = EmissaryPrint(data, ab);
+    break;
+  }
+  case EMIS_ID_MPI: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryMPI(data, ab, args);
+    break;
+  }
+  case EMIS_ID_HDF5: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryHDF5(data, ab, args);
+    break;
+  }
+  case EMIS_ID_RESERVE: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryReserve(data, ab, args);
+    break;
+  }
+  default:
+    fprintf(stderr,
+            "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+            ab->emisid, ab->emisfnid);
+  }
+  free(args);
+  return result;
+}
+
+// emisExtractArgBuf reverses protocol that codegen in EmitEmissaryExec makes.
+extern "C" void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+  uint32_t *int32_data = (uint32_t *)data;
+  ab->DataLen = int32_data[0];
+  ab->NumArgs = int32_data[1];
+
+  // Note: while the data buffer contains all args including strings,
+  // ab->DataLen does not include strings. It only counts header, keys,
+  // and aligned numerics.
+
+  ab->keyptr = data + (2 * sizeof(int));
+  ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+  ab->strptr = data + (size_t)ab->DataLen;
+  int alignfill = 0;
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    alignfill = 4;
+  }
+
+  // Extract the two emissary identifiers and number of send
+  // and recv device data transfers. These are 4 16 bit values
+  // packed into a single 64-bit field.
+  uint64_t arg1 = *(uint64_t *)ab->argptr;
+  ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+  ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+  ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+  // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+  ab->keyptr += sizeof(int);
+  ab->argptr += sizeof(uint64_t);
+  ab->NumArgs -= 1;
+
+  // data_not_used used for testing consistency.
+  ab->data_not_used =
+      (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+                               sizeof(uint64_t) + alignfill);
+
+  // Ensure first arg after emissary id arg is aligned.
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    ab->data_not_used -= 4;
+  }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+extern "C" uint64_t getuint32(char *val) {
+  uint32_t i32 = *(uint32_t *)val;
+  return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+extern "C" uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+/// Get a function pointer from a char ptr
+extern "C" void *getfnptr(char *val) {
+  uint64_t ival = *(uint64_t *)val;
+  return (void *)ival;
+}
+
+// build argument array
+extern "C" uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+                   unsigned long long *data_not_used, emis_argptr_t *a[],
+                   std::unordered_map<void *, void *> *D2HAddrList) {
+  size_t num_bytes;
+  size_t bytes_consumed;
+  size_t strsz;
+  size_t fillerNeeded;
+
+  uint argcount = 0;
+
+  for (int argnum = 0; argnum < NumArgs; argnum++) {
+    num_bytes = 0;
+    strsz = 0;
+    unsigned int key = *(unsigned int *)keyptr;
+    unsigned int llvmID = key >> 16;
+    unsigned int numbits = (key << 16) >> 16;
+
+    switch (llvmID) {
+    case FloatTyID:  ///<  2: 32-bit floating point type
+    case DoubleTyID: ///<  3: 64-bit floating point type
+    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      break;
+
+    case IntegerTyID: ///< 11: Arbitrary bit width integers
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      // fprintf(stderr, "Arg:%d  IntegerType bytes:%ld val:%ld\n",argcount,
+      // num_bytes,(uint64_t) a[argcount]);
+      break;
+
+    case PointerTyID: {   ///< 15: Pointers
+      if (numbits == 1) { // This is a pointer to string
+        num_bytes = 4;
+        bytes_consumed = num_bytes;
+        strsz = (size_t) * (unsigned int *)dataptr;
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)((char *)strptr);
+      } else {
+        num_bytes = 8;
+        bytes_consumed = num_bytes;
+        fillerNeeded = ((size_t)dataptr) % num_bytes;
+        if (fillerNeeded) {
+          dataptr += fillerNeeded; // dataptr is now aligned
+          bytes_consumed += fillerNeeded;
+        }
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      }
+      auto found = D2HAddrList->find((void *)a[argcount]);
+      if (found != D2HAddrList->end())
+        a[argcount] = (emis_argptr_t *)found->second;
+      // fprintf(stderr, "Arg:%d  pointer Type bytes:%ld val:%p\n",argcount,
+      // num_bytes, (void*) a[argcount]);
+    } break;
+
+    case HalfTyID:           ///<  1: 16-bit floating point type
+    case ArrayTyID:          ///< 14: Arrays
+    case StructTyID:         ///< 13: Structures
+    case FunctionTyID:       ///< 12: Functions
+    case TokenTyID:          ///< 10: Tokens
+    case MetadataTyID:       ///<  8: Metadata
+    case LabelTyID:          ///<  7: Labels
+    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
+                             ///<  PowerPC)
+    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
+    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
+    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
+    case TargetExtTyID:      ///< Target extension type
+    case VoidTyID:
+      return _ERC_UNSUPPORTED_ID_ERROR;
+      break;
+    default:
+      return _ERC_INVALID_ID_ERROR;
+    }
+
+    // Move to next argument
+    dataptr += num_bytes;
+    strptr += strsz;
+    *data_not_used -= bytes_consumed;
+    keyptr += 4;
+    argcount++;
+  }
+  return _ERC_SUCCESS;
+}
+
+extern "C" void emisSkipXferArgSet(emisArgBuf_t *ab) {
+  // Skip the ptr and size of the Xfer
+  ab->NumArgs -= 2;
+  ab->keyptr += 2 * sizeof(uint32_t);
+  ab->argptr += 2 * sizeof(void *);
+  ab->data_not_used -= 2 * sizeof(void *);
+}
diff --git a/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp b/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
new file mode 100644
index 0000000000000..7853357b1329a
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
@@ -0,0 +1,470 @@
+//===---- offload/plugins-nextgen/common/src/EmissaryFortrt.cpp  ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Host support for Fortran runtime Emissary API
+//
+//===----------------------------------------------------------------------===//
+#include "PluginInterface.h"
+#include "RPC.h"
+#include "Shared/Debug.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include <Emissary.h>
+#include <EmissaryIds.h>
+#include <assert.h>
+#include <cstring>
+#include <ctype.h>
+#include <list>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+#include <vector>
+
+// Headers for Host Fortran Runtime API as built in llvm/flang/runtime
+extern "C" {
+void *_FortranAioBeginExternalListOutput(uint32_t a1, const char *a2,
+                                         uint32_t a3);
+void *_FortranAioBeginExternalFormattedOutput(const char *ptr1, uint64_t x1,
+                                              void *ptr2, uint32_t x2,
+                                              const char *ptr3, uint32_t x3);
+bool _FortranAioOutputAscii(void *a1, char *a2, uint64_t a3);
+bool _FortranAioOutputInteger32(void *a1, uint32_t a2);
+uint32_t _FortranAioEndIoStatement(void *a1);
+bool _FortranAioOutputInteger8(void *cookie, int8_t n);
+bool _FortranAioOutputInteger16(void *cookie, int16_t n);
+bool _FortranAioOutputInteger64(void *cookie, int64_t n);
+bool _FortranAioOutputReal32(void *cookie, float x);
+bool _FortranAioOutputReal64(void *cookie, double x);
+bool _FortranAioOutputComplex32(void *cookie, float re, float im);
+bool _FortranAioOutputComplex64(void *cookie, double re, double im);
+bool _FortranAioOutputLogical(void *cookie, bool truth);
+void _FortranAAbort();
+void _FortranAStopStatementText(char *errmsg, int64_t a1, bool a2, bool a3);
+void _FortranAStopStatement(int32_t a1, bool a2, bool a3);
+
+//  Save the cookie because deferred functions have execution reordered.
+static void *_list_started_cookie = nullptr;
+extern void *V_FortranAioBeginExternalListOutput(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  int32_t v0 = va_arg(args, int32_t);
+  const char *v1 = va_arg(args, const char *);
+  int32_t v2 = va_arg(args, int32_t);
+  va_end(args);
+  void *cookie = _FortranAioBeginExternalListOutput(v0, v1, v2);
+  _list_started_cookie = cookie;
+  return cookie;
+}
+extern void *V_FortranAioBeginExternalFormattedOutput(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  const char *p0 = va_arg(args, const char *);
+  int64_t v0 = va_arg(args, int64_t);
+  void *p1 = va_arg(args, void *);
+  int32_t v1 = va_arg(args, int32_t);
+  const char *p2 = va_arg(args, const char *);
+  int32_t v2 = va_arg(args, int32_t);
+  va_end(args);
+  void *cookie =
+      _FortranAioBeginExternalFormattedOutput(p0, v0, p1, v1, p2, v2);
+  _list_started_cookie = cookie;
+  return cookie;
+}
+extern bool V_FortranAioOutputAscii(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  char *v1 = va_arg(args, char *);
+  uint64_t v2 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputAscii(v0, v1, v2);
+}
+extern bool V_FortranAioOutputInteger32(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger32(v0, v1);
+}
+extern uint32_t V_FortranAioEndIoStatement(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  va_end(args);
+  v0 = _list_started_cookie;
+  uint32_t rv = _FortranAioEndIoStatement(v0);
+  return rv;
+}
+extern bool V_FortranAioOutputInteger8(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger8(v0, v1);
+}
+extern bool V_FortranAioOutputInteger16(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger16(v0, v1);
+}
+extern bool V_FortranAioOutputInteger64(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputInteger64(v0, v1);
+}
+extern bool V_FortranAioOutputReal32(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  double dv;
+  memcpy(&dv, &v1, 8);
+  return _FortranAioOutputReal32(v0, (float)dv);
+}
+extern bool V_FortranAioOutputReal64(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *cookie = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  va_end(args);
+  cookie = _list_started_cookie;
+  double dv;
+  memcpy(&dv, &v1, 8);
+  return _FortranAioOutputReal64(cookie, dv);
+}
+extern bool V_FortranAioOutputComplex32(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  uint64_t v2 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  double dv1, dv2;
+  memcpy(&dv1, &v1, 8);
+  memcpy(&dv2, &v2, 8);
+  return _FortranAioOutputComplex32(v0, (float)dv1, (float)dv2);
+}
+extern bool V_FortranAioOutputComplex64(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint64_t v1 = va_arg(args, uint64_t);
+  uint64_t v2 = va_arg(args, uint64_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  double dv1, dv2;
+  memcpy(&dv1, &v1, 8);
+  memcpy(&dv2, &v2, 8);
+  return _FortranAioOutputComplex64(v0, dv1, dv2);
+}
+extern bool V_FortranAioOutputLogical(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  void *v0 = va_arg(args, void *);
+  uint32_t v1 = va_arg(args, uint32_t);
+  va_end(args);
+  v0 = _list_started_cookie;
+  return _FortranAioOutputLogical(v0, v1);
+}
+extern void V_FortranAAbort(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  va_end(args);
+  _FortranAAbort();
+  // Now return to device to run abort from stub
+}
+extern void V_FortranAStopStatementText(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  char *errmsg = va_arg(args, char *);
+  int64_t a1 = va_arg(args, int64_t);
+  uint32_t a2 = va_arg(args, uint32_t);
+  uint32_t a3 = va_arg(args, uint32_t);
+  va_end(args);
+  bool b2 = (bool)a2;
+  bool b3 = (bool)a3;
+  _FortranAStopStatementText(errmsg, a1, b2, b3);
+}
+extern void V_FortranAStopStatement(void *fnptr, ...) {
+  va_list args;
+  va_start(args, fnptr);
+  int32_t a1 = va_arg(args, int32_t);
+  uint32_t a2 = va_arg(args, uint32_t);
+  uint32_t a3 = va_arg(args, uint32_t);
+  va_end(args);
+  bool b2 = (bool)a2;
+  bool b3 = (bool)a3;
+  _FortranAStopStatement(a1, b2, b3);
+}
+} // end extern "C"
+
+// Static vars used to defer functions to reorder execution by thread and team.
+static uint32_t _deferred_fn_count = 0;
+static uint32_t _deferred_begin_statements = 0;
+static uint32_t _deferred_end_statements = 0;
+static uint64_t _max_num_threads = 0;
+static uint64_t _max_num_teams = 0;
+
+// structure for deferred functions
+typedef struct {
+  uint32_t NumArgs;    // The number of args in arg_array
+  void *fnptr;         // The function pointer for this index
+  uint64_t fn_idx;     // The function index, good for debug
+  uint32_t dfnid;      // The dvoideferred function id, in order received
+  uint64_t *arg_array; // ptr to malloced arg_array
+  char *c_ptr;         // ptr to null terminated char string
+  char *c_ptr2;        // ptr to null terminated char string
+  uint64_t thread_num;
+  uint64_t num_threads;
+  uint64_t team_num;
+  uint64_t num_teams;
+  EmissaryReturn_t return_value;
+} deferred_entry_t;
+
+static std::vector<deferred_entry_t *> *_deferred_fns_ptr;
+// static std::list<deferred_entry_t *> _deferred_fns;
+//
+
+extern "C" EmissaryReturn_t
+EmissaryFortrt(char *data, emisArgBuf_t *ab,
+               std::unordered_map<void *, void *> *D2HAddrList) {
+  EmissaryReturn_t return_value = (EmissaryReturn_t)0;
+
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+
+  void *fnptr;
+  if (ab->NumArgs <= 0)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  emis_argptr_t *a[MAXVARGS];
+  if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                         &ab->data_not_used, a, D2HAddrList) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // std::list<deferred_entry_t *> _deferred_fns;
+  if (!_deferred_fns_ptr)
+    _deferred_fns_ptr = new std::vector<deferred_entry_t *>;
+
+  char *c_ptr = nullptr;
+  char *c_ptr2 = nullptr;
+  bool defer_for_reorder = true;
+  bool run_deferred_functions = false;
+  switch (ab->emisfnid) {
+  case _FortranAioBeginExternalListOutput_idx: {
+    _deferred_begin_statements++;
+    fnptr = (void *)V_FortranAioBeginExternalListOutput;
+    size_t slen = std::strlen((char *)a[5]) + 1;
+    c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr)
+      fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+    std::strncpy(c_ptr, (char *)a[5], slen - 1);
+    c_ptr[slen - 1] = (char)0;
+    a[5] = (emis_argptr_t *)c_ptr;
+    break;
+  }
+  case _FortranAioBeginExternalFormattedOutput_idx: {
+    _deferred_begin_statements++;
+    fnptr = (void *)V_FortranAioBeginExternalFormattedOutput;
+    size_t slen = std::strlen((char *)a[8]) + 1;
+    c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr)
+      fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+    std::strncpy(c_ptr, (char *)a[8], slen - 1);
+    c_ptr[slen - 1] = (char)0;
+    a[8] = (emis_argptr_t *)c_ptr;
+
+    slen = std::strlen((char *)a[4]) + 1;
+    c_ptr2 = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr2)
+      fprintf(stderr, "MALLOC FAILED for c_ptr2 size:%ld \n", slen);
+    std::strncpy(c_ptr2, (char *)a[4], slen - 1);
+    c_ptr2[slen - 1] = (char)0;
+    a[4] = (emis_argptr_t *)c_ptr2;
+    break;
+  }
+  case _FortranAioOutputAscii_idx: {
+    fnptr = (void *)V_FortranAioOutputAscii;
+
+    size_t slen = (size_t)a[6] + 1;
+    c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+    if (!c_ptr)
+      fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+    std::strncpy(c_ptr, (char *)a[5], slen - 1);
+    c_ptr[slen - 1] = (char)0;
+    a[5] = (emis_argptr_t *)c_ptr;
+
+    break;
+  }
+  case _FortranAioOutputInteger32_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger32;
+    break;
+  }
+  case _FortranAioEndIoStatement_idx: {
+    _deferred_end_statements++;
+    fnptr = (void *)V_FortranAioEndIoStatement;
+    // We cannot use last tread and team number to trigger running deferred
+    // functions because its warp could finish early (out of order). So, if
+    // this is the last FortranAioEndIoStatement by count of begin statements,
+    // then run the deferred functions ordered by team and thread number.
+    if (_deferred_end_statements == _deferred_begin_statements)
+      run_deferred_functions = true;
+    break;
+  }
+  case _FortranAioOutputInteger8_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger8;
+    break;
+  }
+  case _FortranAioOutputInteger16_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger16;
+    break;
+  }
+  case _FortranAioOutputInteger64_idx: {
+    fnptr = (void *)V_FortranAioOutputInteger64;
+    break;
+  }
+  case _FortranAioOutputReal32_idx: {
+    fnptr = (void *)V_FortranAioOutputReal32;
+    break;
+  }
+  case _FortranAioOutputReal64_idx: {
+    fnptr = (void *)V_FortranAioOutputReal64;
+    break;
+  }
+  case _FortranAioOutputComplex32_idx: {
+    fnptr = (void *)V_FortranAioOutputComplex32;
+    break;
+  }
+  case _FortranAioOutputComplex64_idx: {
+    fnptr = (void *)V_FortranAioOutputComplex64;
+    break;
+  }
+  case _FortranAioOutputLogical_idx: {
+    fnptr = (void *)V_FortranAioOutputLogical;
+    break;
+  }
+  case _FortranAAbort_idx: {
+    defer_for_reorder = false;
+    fnptr = (void *)V_FortranAAbort;
+    break;
+  }
+  case _FortranAStopStatementText_idx: {
+    defer_for_reorder = false;
+    fnptr = (void *)V_FortranAStopStatementText;
+    break;
+  }
+  case _FortranAStopStatement_idx: {
+    defer_for_reorder = false;
+    fnptr = (void *)V_FortranAStopStatement;
+    break;
+  }
+  case _FortranAio_INVALID:
+  default: {
+    defer_for_reorder = false;
+    break;
+  }
+  } // end of switch
+
+  if (defer_for_reorder) {
+    _deferred_fn_count++;
+    deferred_entry_t *q = new deferred_entry_t;
+
+    q->dfnid = _deferred_fn_count - 1;
+    q->thread_num = (uint64_t)a[0];
+    q->num_threads = (uint64_t)a[1];
+    _max_num_threads =
+        (q->num_threads > _max_num_threads) ? q->num_threads : _max_num_threads;
+    q->team_num = (uint64_t)a[2];
+    q->num_teams = (uint64_t)a[3];
+    _max_num_teams =
+        (q->num_teams > _max_num_teams) ? q->num_teams : _max_num_teams;
+    q->NumArgs = ab->NumArgs - 4;
+    q->fnptr = fnptr;
+    q->fn_idx = ab->emisfnid;
+    uint64_t *arg_array = (uint64_t *)aligned_alloc(
+        sizeof(uint64_t), (ab->NumArgs - 4) * sizeof(uint64_t));
+    if (!arg_array)
+      fprintf(stderr, " MALLOC FAILED for arg_array size:%ld \n",
+              sizeof(uint64_t) * (ab->NumArgs - 4));
+    for (uint32_t i = 0; i < ab->NumArgs - 4; i++) {
+      uint64_t val = (uint64_t)a[i + 4];
+      arg_array[i] = val;
+    }
+    q->arg_array = arg_array;
+    q->return_value = (EmissaryReturn_t)0;
+    q->c_ptr = c_ptr;
+    q->c_ptr2 = c_ptr2;
+    _deferred_fns_ptr->push_back(q);
+  } else {
+    // execute a non deferred function
+    return_value = EmissaryCallFnptr<EmissaryReturn_t, emisfn_t>(
+        ab->NumArgs - 4, fnptr, &a[4]);
+  }
+
+  if (run_deferred_functions) {
+    // This specific team and thread ordering does not reflect the
+    // actual non-deterministic ordering.
+    for (uint32_t team_num = 0; team_num < _max_num_teams; team_num++) {
+      for (uint32_t thread_num = 0; thread_num < _max_num_threads;
+           thread_num++) {
+        for (auto q : *_deferred_fns_ptr) {
+          if ((thread_num == q->thread_num) && (team_num == q->team_num)) {
+            for (uint32_t i = 0; i < q->NumArgs; i++)
+              a[i] = (emis_argptr_t *)q->arg_array[i];
+            q->return_value = EmissaryCallFnptr<EmissaryReturn_t, emisfn_t>(
+                q->NumArgs, q->fnptr, a);
+          }
+          // Only the return value for the last end statement is returned.
+          return_value = q->return_value;
+        }
+      }
+    }
+
+    //  Reset static deferred function counters and free memory
+    for (auto q : *_deferred_fns_ptr) {
+      if (q->c_ptr)
+        free(q->c_ptr);
+      if (q->c_ptr2)
+        free(q->c_ptr2);
+      free(q->arg_array);
+      delete q;
+    }
+    _deferred_fns_ptr->clear();
+    _deferred_fn_count = 0;
+    _deferred_begin_statements = 0;
+    _deferred_end_statements = 0;
+    _max_num_threads = 0;
+    _max_num_teams = 0;
+    delete _deferred_fns_ptr;
+    _deferred_fns_ptr = nullptr;
+  } // end run_deferred_functions
+
+  return return_value;
+} // end EmissaryFortrt
diff --git a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
new file mode 100644
index 0000000000000..80c55d3c5a5fd
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
@@ -0,0 +1,423 @@
+//===--- offload/plugins-nextgen/common/src/EmissaryPrint.cpp ----- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  Host support for misc emissary API.
+//
+//===----------------------------------------------------------------------===//
+#include <Emissary.h>
+#include <EmissaryIds.h>
+#include <assert.h>
+#include <cstring>
+#include <ctype.h>
+#include <list>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab);
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab);
+
+extern "C" EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab) {
+  uint32_t return_value;
+  service_rc rc;
+  switch (ab->emisfnid) {
+  case _printf_idx: {
+    rc = emissary_printf(&return_value, ab);
+    break;
+  }
+  case _fprintf_idx: {
+    rc = emissary_fprintf(&return_value, ab);
+    break;
+  }
+  case _ockl_asan_report_idx: {
+    fprintf(stderr, " asan_report not yet implemented\n");
+    return_value = 0;
+    rc = _ERC_STATUS_ERROR;
+    break;
+  }
+  case _print_INVALID:
+  default: {
+    fprintf(stderr, " INVALID emissary function id (%d) for PRINT API \n",
+            ab->emisfnid);
+    return_value = 0;
+    rc = _ERC_STATUS_ERROR;
+    break;
+  }
+  }
+  if (rc != _ERC_SUCCESS)
+    fprintf(stderr, "HOST failure in _emissary_execute_print rc:%d\n", rc);
+
+  return (EmissaryReturn_t)return_value;
+}
+
+// NUMFPREGS and FPREGSZ are part of x86 vargs ABI that
+// is recreated with the printf support.
+#define NUMFPREGS 8
+#define FPREGSZ 16
+
+typedef int uint128_t __attribute__((mode(TI)));
+struct emissary_pfIntRegs {
+  uint64_t rdi, rsi, rdx, rcx, r8, r9;
+};
+typedef struct emissary_pfIntRegs emissary_pfIntRegs_t; // size = 48 bytes
+
+struct emissary_pfRegSaveArea {
+  emissary_pfIntRegs_t iregs;
+  uint128_t freg[NUMFPREGS];
+};
+typedef struct emissary_pfRegSaveArea
+    emissary_pfRegSaveArea_t; // size = 304 bytes
+
+struct emissary_ValistExt {
+  uint32_t gp_offset;      /* offset to next available gpr in reg_save_area */
+  uint32_t fp_offset;      /* offset to next available fpr in reg_save_area */
+  void *overflow_arg_area; /* args that are passed on the stack */
+  emissary_pfRegSaveArea_t *reg_save_area; /* int and fp registers */
+  size_t overflow_size;
+} __attribute__((packed));
+typedef struct emissary_ValistExt emissary_ValistExt_t;
+
+// Handle overflow when building the va_list for vprintf
+static service_rc emissary_pfGetOverflow(emissary_ValistExt_t *valist,
+                                         size_t needsize) {
+  if (needsize < valist->overflow_size)
+    return _ERC_SUCCESS;
+
+  // Make the overflow area bigger
+  size_t stacksize;
+  void *newstack;
+  if (valist->overflow_size == 0) {
+    // Make initial save area big to reduce mallocs
+    stacksize = (FPREGSZ * NUMFPREGS) * 2;
+    if (needsize > stacksize)
+      stacksize = needsize; // maybe a big string
+  } else {
+    // Initial save area not big enough, double it
+    stacksize = valist->overflow_size * 2;
+  }
+  if (!(newstack = malloc(stacksize))) {
+    return _ERC_STATUS_ERROR;
+  }
+  memset(newstack, 0, stacksize);
+  if (valist->overflow_size) {
+    memcpy(newstack, valist->overflow_arg_area, valist->overflow_size);
+    free(valist->overflow_arg_area);
+  }
+  valist->overflow_arg_area = newstack;
+  valist->overflow_size = stacksize;
+  return _ERC_SUCCESS;
+}
+
+// Add an integer to the va_list for vprintf
+static service_rc emissary_pfAddInteger(emissary_ValistExt_t *valist, char *val,
+                                        size_t valsize, size_t *stacksize) {
+  uint64_t ival;
+  switch (valsize) {
+  case 1:
+    ival = *(uint8_t *)val;
+    break;
+  case 2:
+    ival = *(uint32_t *)val;
+    break;
+  case 4:
+    ival = (*(uint32_t *)val);
+    break;
+  case 8:
+    ival = *(uint64_t *)val;
+    break;
+  default: {
+    return _ERC_STATUS_ERROR;
+  }
+  }
+  //  Always copy 8 bytes, sizeof(ival)
+  if ((valist->gp_offset + sizeof(ival)) <= sizeof(emissary_pfIntRegs_t)) {
+    memcpy(((char *)valist->reg_save_area + valist->gp_offset), &ival,
+           sizeof(ival));
+    valist->gp_offset += sizeof(ival);
+    return _ERC_SUCCESS;
+  }
+  // Ensure valist overflow area is big enough
+  size_t needsize = (size_t)*stacksize + sizeof(ival);
+  if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+    return _ERC_STATUS_ERROR;
+  // Copy to overflow
+  memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, &ival,
+         sizeof(ival));
+
+  *stacksize += sizeof(ival);
+  return _ERC_SUCCESS;
+}
+
+// Add a String argument when building va_list for vprintf
+static service_rc emissary_pfAddString(emissary_ValistExt_t *valist, char *val,
+                                       size_t strsz, size_t *stacksize) {
+  size_t valsize =
+      sizeof(char *); // ABI captures pointer to string,  not string
+  if ((valist->gp_offset + valsize) <= sizeof(emissary_pfIntRegs_t)) {
+    memcpy(((char *)valist->reg_save_area + valist->gp_offset), val, valsize);
+    valist->gp_offset += valsize;
+    return _ERC_SUCCESS;
+  }
+  size_t needsize = (size_t)*stacksize + valsize;
+  if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+    return _ERC_STATUS_ERROR;
+  memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, val,
+         valsize);
+  *stacksize += valsize;
+  return _ERC_SUCCESS;
+}
+
+// Add a floating point value when building va_list for vprintf
+static service_rc emissary_pfAddFloat(emissary_ValistExt_t *valist,
+                                      char *numdata, size_t valsize,
+                                      size_t *stacksize) {
+  // we could use load because doubles are now aligned
+  double dval;
+  if (valsize == 4) {
+    float fval;
+    memcpy(&fval, numdata, 4);
+    dval = (double)fval; // Extend single to double per abi
+  } else if (valsize == 8) {
+    memcpy(&dval, numdata, 8);
+  } else {
+    return _ERC_STATUS_ERROR;
+  }
+  if ((valist->fp_offset + FPREGSZ) <= sizeof(emissary_pfRegSaveArea_t)) {
+    memcpy(((char *)valist->reg_save_area + (size_t)(valist->fp_offset)), &dval,
+           sizeof(double));
+    valist->fp_offset += FPREGSZ;
+    return _ERC_SUCCESS;
+  }
+  size_t needsize = (size_t)*stacksize + sizeof(double);
+  if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+    return _ERC_STATUS_ERROR;
+  memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, &dval,
+         sizeof(double));
+  // move only by the size of the double (8 bytes)
+  *stacksize += sizeof(double);
+  return _ERC_SUCCESS;
+}
+
+// Build an extended va_list for vprintf by unpacking the buffer
+static service_rc emissary_pfBuildValist(emissary_ValistExt_t *valist,
+                                         int NumArgs, char *keyptr,
+                                         char *dataptr, char *strptr,
+                                         unsigned long long *data_not_used) {
+  emissary_pfRegSaveArea_t *regs;
+  size_t regs_size = sizeof(*regs);
+  regs = (emissary_pfRegSaveArea_t *)malloc(regs_size);
+  if (!regs)
+    return _ERC_STATUS_ERROR;
+  memset(regs, 0, regs_size);
+  *valist = (emissary_ValistExt_t){
+      .gp_offset = 0,
+      .fp_offset = 0,
+      .overflow_arg_area = NULL,
+      .reg_save_area = regs,
+      .overflow_size = 0,
+  };
+
+  size_t num_bytes;
+  size_t bytes_consumed;
+  size_t strsz;
+  size_t fillerNeeded;
+
+  size_t stacksize = 0;
+
+  for (int argnum = 0; argnum < NumArgs; argnum++) {
+    num_bytes = 0;
+    strsz = 0;
+    unsigned int key = *(unsigned int *)keyptr;
+    unsigned int llvmID = key >> 16;
+    unsigned int numbits = (key << 16) >> 16;
+    switch (llvmID) {
+    case FloatTyID:  ///<  2: 32-bit floating point type
+    case DoubleTyID: ///<  3: 64-bit floating point type
+    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+      if (valist->fp_offset == 0)
+        valist->fp_offset = sizeof(emissary_pfIntRegs_t);
+      if (emissary_pfAddFloat(valist, dataptr, num_bytes, &stacksize))
+        return _ERC_ADDFLOAT_ERROR;
+      break;
+
+    case IntegerTyID: ///< 11: Arbitrary bit width integers
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+      if (emissary_pfAddInteger(valist, dataptr, num_bytes, &stacksize))
+        return _ERC_ADDINT_ERROR;
+      break;
+
+    case PointerTyID:     ///< 15: Pointers
+      if (numbits == 1) { // This is a pointer to string
+        num_bytes = 4;
+        bytes_consumed = num_bytes;
+        strsz = (size_t) * (unsigned int *)dataptr;
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        if (emissary_pfAddString(valist, (char *)&strptr, strsz, &stacksize))
+          return _ERC_ADDSTRING_ERROR;
+      } else {
+        num_bytes = 8;
+        bytes_consumed = num_bytes;
+        fillerNeeded = ((size_t)dataptr) % num_bytes;
+        if (fillerNeeded) {
+          dataptr += fillerNeeded; // dataptr is now aligned
+          bytes_consumed += fillerNeeded;
+        }
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        if (emissary_pfAddInteger(valist, dataptr, num_bytes, &stacksize))
+          return _ERC_ADDINT_ERROR;
+      }
+      break;
+
+    case HalfTyID:           ///<  1: 16-bit floating point type
+    case ArrayTyID:          ///< 14: Arrays
+    case StructTyID:         ///< 13: Structures
+    case FunctionTyID:       ///< 12: Functions
+    case TokenTyID:          ///< 10: Tokens
+    case MetadataTyID:       ///<  8: Metadata
+    case LabelTyID:          ///<  7: Labels
+    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
+                             ///<  PowerPC)
+    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
+    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
+    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
+    case TargetExtTyID:      ///< Target extension type
+    case VoidTyID:
+      return _ERC_UNSUPPORTED_ID_ERROR;
+      break;
+    default:
+      return _ERC_INVALID_ID_ERROR;
+    }
+
+    dataptr += num_bytes;
+    strptr += strsz;
+    *data_not_used -= bytes_consumed;
+    keyptr += 4;
+  }
+  return _ERC_SUCCESS;
+} // end emissary_pfBuildValist
+
+/*
+ *  The buffer to pack arguments for all vargs functions has thes 4 sections:
+ *  1. Header  datalen 4 bytes
+ *             numargs 4 bytes
+ *  2. keyptr  A 4-byte key for each arg including string args
+ *             Each 4-byte key contains llvmID and numbits to
+ *             describe the datatype.
+ *  3. argptr  Ths data values for each argument.
+ *             Each arg is aligned according to its size.
+ *             If the field is a string
+ *             the dataptr contains the string length.
+ *  4. strptr  Exection time string values
+ */
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
+
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+
+  char *fmtstr = ab->strptr;
+  FILE *fileptr = (FILE *)*((size_t *)ab->argptr);
+
+  // Skip past the file pointer
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  ab->argptr += sizeof(FILE *);
+  ab->data_not_used -= sizeof(FILE *);
+
+  // Skip past the format string
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+  ab->strptr += abstrsz;
+  ab->argptr += 4;
+  ab->data_not_used -= 4;
+
+  emissary_ValistExt_t valist;
+  va_list *real_va_list;
+  real_va_list = (va_list *)&valist;
+
+  if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+                             ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // Roll back offsets and save stack pointer
+  valist.gp_offset = 0;
+  valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+  void *save_stack = valist.overflow_arg_area;
+  *rc = vfprintf(fileptr, fmtstr, *real_va_list);
+  if (valist.reg_save_area)
+    free(valist.reg_save_area);
+  if (save_stack)
+    free(save_stack);
+  return _ERC_SUCCESS;
+}
+
+static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab) {
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+
+  char *fmtstr = ab->strptr;
+
+  // Skip past the format string
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+  ab->strptr += abstrsz;
+  ab->argptr += 4;
+  ab->data_not_used -= 4;
+
+  emissary_ValistExt_t valist;
+  va_list *real_va_list;
+  real_va_list = (va_list *)&valist;
+
+  if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+                             ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // Roll back offsets and save stack pointer for
+  valist.gp_offset = 0;
+  valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+  void *save_stack = valist.overflow_arg_area;
+  *rc = vprintf(fmtstr, *real_va_list);
+  if (valist.reg_save_area)
+    free(valist.reg_save_area);
+  if (save_stack)
+    free(save_stack);
+  return _ERC_SUCCESS;
+}
+
+extern "C" void *global_allocate(uint32_t bufsz) {
+  return malloc((size_t)bufsz);
+}
+extern "C" int global_free(void *ptr) {
+  free(ptr);
+  return 0;
+}
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 8d6704733970d..c3f041aa26637 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -13,9 +13,11 @@
 
 #include "PluginInterface.h"
 
+#include "Emissary.h"
 #include "shared/rpc.h"
 #include "shared/rpc_opcodes.h"
 #include "shared/rpc_server.h"
+#include <unordered_map>
 
 using namespace llvm;
 using namespace omp;
@@ -63,6 +65,156 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
     });
     break;
   }
+
+  // This case handles the device function __llvm_emissary_rpc for emissary
+  // APIs that require no d2h or h2d memory transfer.
+  case OFFLOAD_EMISSARY: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    Port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+    uint32_t id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t ab;
+        emisExtractArgBuf((char *)buffer_ptr, &ab);
+        Results[id++] = Emissary((char *)buffer_ptr, &ab, nullptr);
+      }
+    }
+    Port.send([&](rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+    break;
+  }
+
+  // This case handles the device function __llvm_emissary_rpc_dm for emissary
+  // APIs require D2H or H2D transfer vectors to be processed through the Port.
+  // FIXME: test with multiple transfer vectors of the same type.
+  case OFFLOAD_EMISSARY_DM: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    Port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+    uint32_t id = 0;
+    emisArgBuf_t AB[NumLanes];
+    std::unordered_map<void *, void *> D2HAddrList;
+    void *Xfers[NumLanes] = {nullptr};
+    void *devXfers[NumLanes] = {nullptr};
+    uint64_t XferSzs[NumLanes] = {0};
+    uint32_t numSendXfers = 0;
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          numSendXfers++;
+          devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+          XferSzs[id] = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          emisSkipXferArgSet(ab);
+        }
+        // Allocate the host space for the receive Xfers
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          size_t devSz = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          void *hostAddr = new char[devSz];
+          D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+    // recv_n for device send_n into new host-allocated Xfers
+    if (numSendXfers)
+      Port.recv_n(Xfers, XferSzs,
+                  [&](uint64_t Size) { return new char[Size]; });
+
+    // Xfers now contains just allocated host addrs for sends and
+    // devXfers contains corresponding devAddr for those sends
+    // Build map to pass to Emissary
+    id = 0;
+    for (void *Xfer : Xfers) {
+      if (Xfer) {
+        D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+        id++;
+      }
+    }
+
+    // Call Emissary for each active lane
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+          emisSkipXferArgSet(ab);
+        Results[id] = Emissary((char *)buffer_ptr, ab, &D2HAddrList);
+        id++;
+      }
+    }
+
+    // Process send_n for the H2D Xfers.
+    void *recvXfers[NumLanes] = {nullptr};
+    uint64_t recvXferSzs[NumLanes] = {0};
+    id = 0;
+    uint32_t numRecvXfers = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset ArgBuf tracker
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          numRecvXfers++;
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          recvXfers[id] = D2HAddrList[devAddr];
+          recvXferSzs[id] =
+              (uint64_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+    if (numRecvXfers)
+      Port.send_n(recvXfers, recvXferSzs);
+    // Cleanup all host allocated transfer buffers
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset the ArgBuf tracker ab
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        // Cleanup host allocated send Xfers
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        // Cleanup host allocated bufs
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+
+    Port.send([&](rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+
+    break;
+  } // END CASE OFFLOAD_EMISSARY
+
   default:
     return rpc::RPC_UNHANDLED_OPCODE;
     break;
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..48873601e4f8f 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -7,6 +7,15 @@ if(LLVM_VERSION_MAJOR AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
                       " is not 'Clang ${req_ver}'.")
 endif()
 
+option(OFFLOAD_ENABLE_EMISSARY_APIS "Enable build of GPU Emissary APIs" ON)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+  add_definitions(-DOFFLOAD_ENABLE_EMISSARY_APIS)
+  set(emissary_sources
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/EmissaryFortrt.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/EmissaryPrint.cpp
+  )
+endif()
+
 set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Allocator.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Configuration.cpp
@@ -23,6 +32,7 @@ set(src_files
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Tasking.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceUtils.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
+  ${emissary_sources}
 )
 
 list(APPEND compile_options -flto)
diff --git a/openmp/device/src/EmissaryFortrt.cpp b/openmp/device/src/EmissaryFortrt.cpp
new file mode 100644
index 0000000000000..a0d45f4ae2080
--- /dev/null
+++ b/openmp/device/src/EmissaryFortrt.cpp
@@ -0,0 +1,144 @@
+//===- EmissaryFortrt.cpp - Fortran Runtime emissary API ----- ---- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device stubs for Fortran Runtime emissary API
+//
+//===----------------------------------------------------------------------===//
+
+#include "DeviceTypes.h"
+#include "EmissaryIds.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+
+unsigned long long _emissary_exec(unsigned long long, ...);
+
+extern "C" {
+
+// The clang compiler will generate calls to this only when a string length is
+// not a compile time constant.
+uint32_t __strlen_max(char *instr, uint32_t maxstrlen) {
+  for (uint32_t i = 0; i < maxstrlen; i++)
+    if (instr[i] == (char)0)
+      return (uint32_t)(i + 1);
+  return maxstrlen;
+}
+
+uint32_t omp_get_thread_num();
+uint32_t omp_get_num_threads();
+uint32_t omp_get_team_num();
+uint32_t omp_get_num_teams();
+
+// All Fortran Runtime Functions pass 4 extra args to assist with
+// defered execution and debug. The host variadic wrappers do not use
+// these arguments when calling the actual Fortran runtime.
+#define _EXTRA_ARGS                                                            \
+  omp_get_thread_num(), omp_get_num_threads(), omp_get_team_num(),             \
+      omp_get_num_teams()
+#define _START_ARGS(idx) _PACK_EMIS_IDS(EMIS_ID_FORTRT, idx, 0, 0), _EXTRA_ARGS,
+
+void *_FortranAioBeginExternalListOutput(uint32_t a1, const char *a2,
+                                         uint32_t a3) {
+  void *cookie = (void *)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioBeginExternalListOutput_idx, 0,
+                     0),
+      _EXTRA_ARGS, a1, a2, a3);
+  return cookie;
+}
+
+void *_FortranAioBeginExternalFormattedOutput(char *fmt, uint64_t fmtlen,
+                                              void *ptr, uint32_t val1,
+                                              char *source_name,
+                                              uint32_t val2) {
+  fmt[fmtlen - 1] = (char)0;
+  void *cookie = (void *)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT,
+                     _FortranAioBeginExternalFormattedOutput_idx, 0, 0),
+      _EXTRA_ARGS, fmt, fmtlen, ptr, val1, source_name, val2);
+  return cookie;
+}
+
+bool _FortranAioOutputAscii(void *a1, char *a2, uint64_t a3) {
+  // insert null terminating char so  _emissary_exec can correctly
+  // calculate runtime str length
+  a2[a3 - 1] = (char)0;
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputAscii_idx, 0, 0),
+      _EXTRA_ARGS, a1, a2, a3);
+}
+bool _FortranAioOutputInteger32(void *a1, uint32_t a2) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger32_idx, 0, 0),
+      _EXTRA_ARGS, a1, a2);
+}
+uint32_t _FortranAioEndIoStatement(void *a1) {
+  return (uint32_t)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioEndIoStatement_idx, 0, 0),
+      _EXTRA_ARGS, a1);
+}
+bool _FortranAioOutputInteger8(void *cookie, int8_t n) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger8_idx, 0, 0),
+      _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputInteger16(void *cookie, int16_t n) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger16_idx, 0, 0),
+      _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputInteger64(void *cookie, int64_t n) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger64_idx, 0, 0),
+      _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputReal32(void *cookie, float x) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputReal32_idx, 0, 0),
+      _EXTRA_ARGS, cookie, x);
+}
+bool _FortranAioOutputReal64(void *cookie, double x) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputReal64_idx, 0, 0),
+      _EXTRA_ARGS, cookie, x);
+}
+bool _FortranAioOutputComplex32(void *cookie, float re, float im) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputComplex32_idx, 0, 0),
+      _EXTRA_ARGS, cookie, re, im);
+}
+bool _FortranAioOutputComplex64(void *cookie, double re, double im) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputComplex64_idx, 0, 0),
+      _EXTRA_ARGS, cookie, re, im);
+}
+bool _FortranAioOutputLogical(void *cookie, bool barg) {
+  return (bool)_emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputLogical_idx, 0, 0),
+      _EXTRA_ARGS, cookie, barg);
+}
+void _FortranAAbort() {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAAbort_idx, 0, 0),
+                 _EXTRA_ARGS);
+  // When  host service _FortranAAbort finishes, we must die from the device.
+  __builtin_trap();
+}
+void _FortranAStopStatement(int32_t a1, bool a2, bool a3) {
+  _emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAStopStatement_idx, 0, 0),
+      _EXTRA_ARGS, a1, a2, a3);
+  __builtin_trap();
+}
+void _FortranAStopStatementText(char *errmsg, int64_t a1, bool a2, bool a3) {
+  errmsg[a1 - 1] = (char)0;
+  _emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAStopStatementText_idx, 0, 0),
+      _EXTRA_ARGS, errmsg, a1, a2, a3);
+  __builtin_trap();
+}
+
+} // end extern "C"
+#undef _EXTRA_ARGS
diff --git a/openmp/device/src/EmissaryPrint.cpp b/openmp/device/src/EmissaryPrint.cpp
new file mode 100644
index 0000000000000..80bd0262d5ed7
--- /dev/null
+++ b/openmp/device/src/EmissaryPrint.cpp
@@ -0,0 +1,79 @@
+//===----------- EmissaryPrint.cpp - Misc Emissary API ------------ c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device stubs for misc emissary API
+//
+//===----------------------------------------------------------------------===//
+
+#include "Allocator.h"
+#include "Configuration.h"
+#include "DeviceTypes.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+
+#include "Debug.h"
+#include "EmissaryIds.h"
+
+#if 0
+__attribute__((flatten, always_inline)) void f90print_(char *s) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s\n", s);
+}
+__attribute__((flatten, always_inline)) void f90printi_(char *s, int *i) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s $d\n", s, *i);
+}
+__attribute__((flatten, always_inline)) void f90printl_(char *s, long *i) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %ld\n", s, *i);
+}
+__attribute__((flatten, always_inline)) void f90printf_(char *s, float *f) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %f\n", s, *f);
+}
+__attribute__((flatten, always_inline)) void f90printd_(char *s, double *d) {
+  _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %g\n", s, *d);
+}
+#endif
+
+// This definition of __ockl_devmem_request and __ockl_sanitizer_report needs to
+// override the weak symbol for __ockl_devmem_request and
+// __ockl_sanitizer_report in rocm device lib ockl.bc because ockl uses
+// hostcall but OpenMP uses rpc.
+//
+extern "C" {
+__attribute__((noinline)) int
+__ockl_sanitizer_report(uint64_t addr, uint64_t pc, uint64_t wgidx,
+                        uint64_t wgidy, uint64_t wgidz, uint64_t wave_id,
+                        uint64_t is_read, uint64_t access_size) {
+  unsigned long long rc = _emissary_exec(
+      _PACK_EMIS_IDS(EMIS_ID_PRINT, _ockl_asan_report_idx, 0, 0), addr, pc,
+      wgidx, wgidy, wgidz, wave_id, is_read, access_size);
+  return ((int)rc);
+}
+#if SANITIZER_AMDGPU
+__attribute__((noinline)) uint64_t __asan_malloc_impl(uint64_t bufsz,
+                                                      uint64_t pc);
+__attribute__((noinline)) void __asan_free_impl(uint64_t ptr, uint64_t pc);
+#endif
+
+__attribute__((flatten, always_inline)) char *global_allocate(uint32_t bufsz) {
+#if SANITIZER_AMDGPU
+  return (char *)__asan_malloc_impl(bufsz,
+                                    (uint64_t)__builtin_return_address(0));
+#else
+  return (char *)malloc((uint64_t)bufsz);
+#endif
+}
+__attribute__((flatten, always_inline)) int global_free(void *ptr) {
+#if SANITIZER_AMDGPU
+  __asan_free_impl((uint64_t)ptr, (uint64_t)__builtin_return_address(0));
+#else
+  free(ptr);
+#endif
+  return 0;
+}
+
+//
+} // end extern "C"
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 5d5a2a383f2b2..fc300d60ea752 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -134,6 +134,68 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
   Port.close();
   return Ret;
 }
+
+// Calls to __llvm_omp_emissary_rpc and __llvm_omp_emissary_premalloc are
+// generated by device codegen at _emissary_exec call sites.
+// See clang/lib/CodeGen/CGEmitEmissaryExec.cpp
+void *__llvm_emissary_premalloc(uint32_t sz) {
+  return omp_alloc((size_t)sz, omp_default_mem_alloc);
+}
+
+unsigned long long __llvm_emissary_rpc(uint32_t sz32, void *bufdata) {
+  rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_EMISSARY>();
+  Port.send_n(bufdata, (size_t)sz32);
+  unsigned long long Ret;
+  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+    Ret = static_cast<unsigned long long>(Buffer->data[0]);
+  });
+  omp_free(bufdata, omp_default_mem_alloc);
+  Port.close();
+  return Ret;
+}
+
+// This is for emissary APIs that require d2h or h2d memory transfers.
+unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
+  rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_EMISSARY_DM>();
+  Port.send_n(bufdata, (size_t)sz32);
+
+  char *data = (char *)bufdata;
+  uint32_t *int32_data = (uint32_t *)data;
+  uint32_t NumArgs = int32_data[1];
+  char *keyptr = data + (2 * sizeof(int));
+  char *argptr = keyptr + (NumArgs * sizeof(int));
+  if (((size_t)argptr) % (size_t)8)
+    argptr += 4; // argptr must be aligned
+  uint64_t arg1 = *(uint64_t *)argptr;
+  uint32_t NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  uint32_t NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+  // Skip by arg1 and process Send and Recv Xfers if any
+  argptr += sizeof(uint64_t);
+  for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
+    void *D2Hdata = (void *)*((uint64_t *)argptr);
+    argptr += sizeof(void *);
+    size_t D2Hsize = (size_t) * ((size_t *)argptr);
+    argptr += sizeof(size_t);
+    Port.send_n(D2Hdata, D2Hsize);
+  }
+  for (uint32_t idx = 0; idx < NumRecvXfers; idx++) {
+    void *H2Ddata = (void *)*((uint64_t *)argptr);
+    argptr += sizeof(void *);
+    argptr += sizeof(size_t);
+    uint64_t recv_size;
+    void *buf = nullptr;
+    Port.recv_n(&buf, &recv_size,
+                [&](uint64_t) { return reinterpret_cast<void *>(H2Ddata); });
+  }
+
+  unsigned long long Ret;
+  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+    Ret = static_cast<unsigned long long>(Buffer->data[0]);
+  });
+  omp_free(bufdata, omp_default_mem_alloc);
+  Port.close();
+  return Ret;
+}
 }
 
 // C++ ABI helpers.

>From e61bf8f536e5dfa3b1fc61d6dbbddfa96386a110 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Mon, 12 Jan 2026 06:43:48 -0600
Subject: [PATCH 2/2] [clang-format] apply more recent clang-format. Only
 clang-format changes with this commit

---
 clang/include/clang/Options/Options.td               | 12 +++++++-----
 clang/lib/Headers/EmissaryIds.h                      |  4 ++--
 offload/plugins-nextgen/common/src/Emissary.cpp      |  2 +-
 offload/plugins-nextgen/common/src/EmissaryPrint.cpp |  6 +++---
 offload/plugins-nextgen/common/src/RPC.cpp           |  6 +++---
 openmp/device/src/Misc.cpp                           |  2 +-
 6 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 8cf017d8effd7..5e9c2e35a431a 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -8940,11 +8940,13 @@ def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
 
 } // let Visibility = [CC1Option, FC1Option]
 
-defm use_emissary_print: BoolFOption<"use-emissary-print",
-  LangOpts<"UseEmissaryPrint">, DefaultTrue,
-  PosFlag<SetTrue, [], [ClangOption, CC1Option],
-  "Enable use of Emissary printf/fprint overriding device libc printf/fprintf">,
-  NegFlag<SetFalse>>;
+defm use_emissary_print
+    : BoolFOption<"use-emissary-print", LangOpts<"UseEmissaryPrint">,
+                  DefaultTrue,
+                  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+                          "Enable use of Emissary printf/fprint overriding "
+                          "device libc printf/fprintf">,
+                  NegFlag<SetFalse>>;
 
 //===----------------------------------------------------------------------===//
 // Coarray Options
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
index f9ff616a0aacb..3c38f3d24bdaa 100644
--- a/clang/lib/Headers/EmissaryIds.h
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -33,8 +33,8 @@ typedef enum {
 /// The vargs function used by emissary API device stubs
 unsigned long long _emissary_exec(unsigned long long, ...);
 
-//#define _PACK_EMIS_IDS(x, y)                                                   \
-//  ((unsigned long long)x << 32) | ((unsigned long long)y)
+// #define _PACK_EMIS_IDS(x, y) \
+//   ((unsigned long long)x << 32) | ((unsigned long long)y)
 
 #define _PACK_EMIS_IDS(a, b, c, d)                                             \
   ((unsigned long long)a << 48) | ((unsigned long long)b << 32) |              \
diff --git a/offload/plugins-nextgen/common/src/Emissary.cpp b/offload/plugins-nextgen/common/src/Emissary.cpp
index 0be0e426ee050..063e1aafe9c34 100644
--- a/offload/plugins-nextgen/common/src/Emissary.cpp
+++ b/offload/plugins-nextgen/common/src/Emissary.cpp
@@ -196,7 +196,7 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
       if (numbits == 1) { // This is a pointer to string
         num_bytes = 4;
         bytes_consumed = num_bytes;
-        strsz = (size_t) * (unsigned int *)dataptr;
+        strsz = (size_t)*(unsigned int *)dataptr;
         if ((*data_not_used) < bytes_consumed)
           return _ERC_DATA_USED_ERROR;
         a[argcount] = (emis_argptr_t *)((char *)strptr);
diff --git a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
index 80c55d3c5a5fd..88d92b3274ae1 100644
--- a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
+++ b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
@@ -276,7 +276,7 @@ static service_rc emissary_pfBuildValist(emissary_ValistExt_t *valist,
       if (numbits == 1) { // This is a pointer to string
         num_bytes = 4;
         bytes_consumed = num_bytes;
-        strsz = (size_t) * (unsigned int *)dataptr;
+        strsz = (size_t)*(unsigned int *)dataptr;
         if ((*data_not_used) < bytes_consumed)
           return _ERC_DATA_USED_ERROR;
         if (emissary_pfAddString(valist, (char *)&strptr, strsz, &stacksize))
@@ -355,7 +355,7 @@ static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
   // Skip past the format string
   ab->NumArgs--;
   ab->keyptr += 4;
-  size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+  size_t abstrsz = (size_t)*(unsigned int *)ab->argptr;
   ab->strptr += abstrsz;
   ab->argptr += 4;
   ab->data_not_used -= 4;
@@ -389,7 +389,7 @@ static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab) {
   // Skip past the format string
   ab->NumArgs--;
   ab->keyptr += 4;
-  size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+  size_t abstrsz = (size_t)*(unsigned int *)ab->argptr;
   ab->strptr += abstrsz;
   ab->argptr += 4;
   ab->data_not_used -= 4;
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index c3f041aa26637..9474982637135 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -112,13 +112,13 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
         for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
           numSendXfers++;
           devXfers[id] = (void *)*((uint64_t *)ab->argptr);
-          XferSzs[id] = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          XferSzs[id] = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
           emisSkipXferArgSet(ab);
         }
         // Allocate the host space for the receive Xfers
         for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
           void *devAddr = (void *)*((uint64_t *)ab->argptr);
-          size_t devSz = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+          size_t devSz = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
           void *hostAddr = new char[devSz];
           D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
           emisSkipXferArgSet(ab);
@@ -174,7 +174,7 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
           void *devAddr = (void *)*((uint64_t *)ab->argptr);
           recvXfers[id] = D2HAddrList[devAddr];
           recvXferSzs[id] =
-              (uint64_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+              (uint64_t)*((size_t *)(ab->argptr + sizeof(void *)));
           emisSkipXferArgSet(ab);
         }
         id++;
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index fc300d60ea752..77f9815dd2a95 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -174,7 +174,7 @@ unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
   for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
     void *D2Hdata = (void *)*((uint64_t *)argptr);
     argptr += sizeof(void *);
-    size_t D2Hsize = (size_t) * ((size_t *)argptr);
+    size_t D2Hsize = (size_t)*((size_t *)argptr);
     argptr += sizeof(size_t);
     Port.send_n(D2Hdata, D2Hsize);
   }



More information about the Openmp-commits mailing list