[clang] [libc] [llvm] [OpenMP] [cuda] [hip] [libc] Support for Emissary APIs (PR #187602)

Greg Rodgers via cfe-commits cfe-commits at lists.llvm.org
Fri Mar 20 07:44:27 PDT 2026


https://github.com/gregrodgers updated https://github.com/llvm/llvm-project/pull/187602

>From c6cb50815a6911bc147ca9348f3de3e164ef4594 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Thu, 19 Mar 2026 17:18:50 -0500
Subject: [PATCH 1/2] [OpenMP] [cuda] [hip] [libc] Support for Emissary APIs
 without EmissaryPrint

---
 78                                            | 554 ++++++++++++++++++
 clang/lib/CodeGen/CGEmitEmissaryExec.cpp      | 392 +++++++++++++
 clang/lib/CodeGen/CGExpr.cpp                  |  10 +
 clang/lib/CodeGen/CMakeLists.txt              |   1 +
 clang/lib/CodeGen/CodeGenFunction.h           |   1 +
 clang/lib/Headers/CMakeLists.txt              |   1 +
 clang/lib/Headers/EmissaryIds.h               | 121 ++++
 libc/docs/gpu/emissary.rst                    | 274 +++++++++
 libc/docs/gpu/index.rst                       |   1 +
 libc/shared/rpc_server.h                      |   2 +
 libc/src/__support/RPC/CMakeLists.txt         |   1 +
 .../__support/RPC/emissary_device_utils.cpp   | 115 ++++
 libc/src/__support/RPC/emissary_rpc_server.h  | 515 ++++++++++++++++
 offload/plugins-nextgen/common/src/RPC.cpp    |   3 +
 14 files changed, 1991 insertions(+)
 create mode 100644 78
 create mode 100644 clang/lib/CodeGen/CGEmitEmissaryExec.cpp
 create mode 100644 clang/lib/Headers/EmissaryIds.h
 create mode 100644 libc/docs/gpu/emissary.rst
 create mode 100644 libc/src/__support/RPC/emissary_device_utils.cpp
 create mode 100644 libc/src/__support/RPC/emissary_rpc_server.h

diff --git a/78 b/78
new file mode 100644
index 0000000000000..203e1397f84f6
--- /dev/null
+++ b/78
@@ -0,0 +1,554 @@
+//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is intended to be used externally as part of the `shared/`
+// interface. Consider this an extenion of rpc_server.h to support emissary
+// APIs. rpc_server.h must be included first.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+#define LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+
+#include "../clang/lib/Headers/EmissaryIds.h"
+#include "rpc_server.h"
+#include <string.h>
+#include <unordered_map>
+
+namespace EmissaryExternal {
+extern "C" {
+/// Called by EmissaryTop for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+                                                 emis_argptr_t *arg[]);
+/// Called by EmissaryTop for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+                                                  emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+                                                     emisArgBuf_t *ab,
+                                                     emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support Fortran IO runtime
+__attribute((weak)) EmissaryReturn_t EmissaryFortrt(char *data,
+                                                    emisArgBuf_t *ab,
+                                                    emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support printf/fprintf/asan report externally
+__attribute((weak)) EmissaryReturn_t EmissaryPrint(char *data,
+                                                    emisArgBuf_t *ab,
+                                                    emis_argptr_t *arg[]);
+} // end extern "C"
+} // namespace EmissaryExternal
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+#if 1
+enum TypeID {
+  // PrimitiveTypes
+  HalfTyID = 0,  ///< 16-bit floating point type
+  BFloatTyID,    ///< 16-bit floating point type (7-bit significand)
+  FloatTyID,     ///< 32-bit floating point type
+  DoubleTyID,    ///< 64-bit floating point type
+  X86_FP80TyID,  ///< 80-bit floating point type (X87)
+  FP128TyID,     ///< 128-bit floating point type (112-bit significand)
+  PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+  VoidTyID,      ///< type with no size
+  LabelTyID,     ///< Labels
+  MetadataTyID,  ///< Metadata
+  X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
+  TokenTyID,     ///< Tokens
+
+  // Derived types... see DerivedTypes.h file.
+  IntegerTyID,        ///< Arbitrary bit width integers
+  ByteTyID,           ///< Arbitrary bit width bytes
+  FunctionTyID,       ///< Functions
+  PointerTyID,        ///< Pointers
+  StructTyID,         ///< Structures
+  ArrayTyID,          ///< Arrays
+  FixedVectorTyID,    ///< Fixed width SIMD vector type
+  ScalableVectorTyID, ///< Scalable SIMD vector type
+  TypedPointerTyID,   ///< Typed pointer used by some GPU targets
+  TargetExtTyID,      ///< Target extension type
+};
+#endif
+
+// emisExtractArgBuf extract ArgBuf using protocol EmitEmissaryExec makes.
+static void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+  uint32_t *int32_data = (uint32_t *)data;
+  ab->DataLen = int32_data[0];
+  ab->NumArgs = int32_data[1];
+
+  // Note: while the data buffer contains all args including strings,
+  // ab->DataLen does not include strings. It only counts header, keys,
+  // and aligned numerics.
+
+  ab->keyptr = data + (2 * sizeof(int));
+  ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+  ab->strptr = data + (size_t)ab->DataLen;
+  int alignfill = 0;
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    alignfill = 4;
+  }
+
+  // Extract the two emissary identifiers and number of send
+  // and recv device data transfers. These are 4 16 bit values
+  // packed into a single 64-bit field.
+  uint64_t arg1 = *(uint64_t *)ab->argptr;
+  ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+  ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+  ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+  // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+  ab->keyptr += sizeof(int);
+  ab->argptr += sizeof(uint64_t);
+  ab->NumArgs -= 1;
+
+  // data_not_used used for testing consistency.
+  ab->data_not_used =
+      (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+                               sizeof(uint64_t) + alignfill);
+
+  // Ensure first arg after emissary id arg is aligned.
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    ab->data_not_used -= 4;
+  }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+static uint64_t getuint32(char *val) {
+  uint32_t i32 = *(uint32_t *)val;
+  return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+static uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+// build argument array to create call to variadic wrappers
+static uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+                   unsigned long long *data_not_used, emis_argptr_t *a[],
+                   std::unordered_map<void *, void *> *D2HAddrList) {
+  size_t num_bytes;
+  size_t bytes_consumed;
+  size_t strsz;
+  size_t fillerNeeded;
+
+  uint argcount = 0;
+
+  for (int argnum = 0; argnum < NumArgs; argnum++) {
+    num_bytes = 0;
+    strsz = 0;
+    unsigned int key = *(unsigned int *)keyptr;
+    unsigned int llvmID = key >> 16;
+    unsigned int numbits = (key << 16) >> 16;
+
+    switch (llvmID) {
+    case FloatTyID:  ///<  2: 32-bit floating point type
+    case DoubleTyID: ///<  3: 64-bit floating point type
+    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      break;
+
+    case IntegerTyID: ///< 11: Arbitrary bit width integers
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      break;
+
+    case PointerTyID: {   ///< 15: Pointers
+      if (numbits == 1) { // This is a pointer to string
+        num_bytes = 4;
+        bytes_consumed = num_bytes;
+        strsz = (size_t)*(unsigned int *)dataptr;
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)((char *)strptr);
+      } else {
+        num_bytes = 8;
+        bytes_consumed = num_bytes;
+        fillerNeeded = ((size_t)dataptr) % num_bytes;
+        if (fillerNeeded) {
+          dataptr += fillerNeeded; // dataptr is now aligned
+          bytes_consumed += fillerNeeded;
+        }
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      }
+      if (D2HAddrList) {
+        auto found = D2HAddrList->find((void *)a[argcount]);
+        if (found != D2HAddrList->end())
+          a[argcount] = (emis_argptr_t *)found->second;
+      }
+    } break;
+
+    case HalfTyID:           ///<  1: 16-bit floating point type
+    case ArrayTyID:          ///< 14: Arrays
+    case StructTyID:         ///< 13: Structures
+    case FunctionTyID:       ///< 12: Functions
+    case TokenTyID:          ///< 10: Tokens
+    case MetadataTyID:       ///<  8: Metadata
+    case LabelTyID:          ///<  7: Labels
+    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
+                             ///<  PowerPC)
+    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
+    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
+    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
+    case TargetExtTyID:      ///< Target extension type
+    case VoidTyID:
+      return _ERC_UNSUPPORTED_ID_ERROR;
+      break;
+    default:
+      return _ERC_INVALID_ID_ERROR;
+    }
+    // Move to next argument
+    dataptr += num_bytes;
+    strptr += strsz;
+    *data_not_used -= bytes_consumed;
+    keyptr += 4;
+    argcount++;
+  }
+  return _ERC_SUCCESS;
+}
+
+//  Utility to skip two args in the ArgBuf
+static void emisSkipXferArgSet(emisArgBuf_t *ab) {
+  // Skip the ptr and size of the Xfer
+  ab->NumArgs -= 2;
+  ab->keyptr += 2 * sizeof(uint32_t);
+  ab->argptr += 2 * sizeof(void *);
+  ab->data_not_used -= 2 * sizeof(void *);
+}
+
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
+
+  if (ab->DataLen == 0)
+    return _ERC_SUCCESS;
+  char *fmtstr = ab->strptr;
+  FILE *fileptr = (FILE *)*((size_t *)ab->argptr);
+
+  // Skip past the file pointer
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  ab->argptr += sizeof(FILE *);
+  ab->data_not_used -= sizeof(FILE *);
+
+  // Skip past the format string
+  ab->NumArgs--;
+  ab->keyptr += 4;
+  size_t abstrsz = (size_t)*(unsigned int *)ab->argptr;
+  ab->strptr += abstrsz;
+  ab->argptr += 4;
+  ab->data_not_used -= 4;
+
+  emissary_ValistExt_t valist; // FIXME: we may want to align this declare
+  va_list *real_va_list;
+  real_va_list = (va_list *)&valist;
+
+  if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+                             ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+    return _ERC_ERROR_INVALID_REQUEST;
+
+  // Roll back offsets and save stack pointer
+  valist.gp_offset = 0;
+  valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+  void *save_stack = valist.overflow_arg_area;
+  *rc = vfprintf(fileptr, fmtstr, *real_va_list);
+  if (valist.reg_save_area)
+    free(valist.reg_save_area);
+  if (save_stack)
+    free(save_stack);
+  return _ERC_SUCCESS;
+}
+
+static EmissaryReturn_t
+EmissaryTop(char *data, emisArgBuf_t *ab,
+            std::unordered_map<void *, void *> *D2HAddrList) {
+  EmissaryReturn_t result = 0;
+  emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+      sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+  switch (ab->emisid) {
+  case EMIS_ID_INVALID: {
+    fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+    result = 0;
+    break;
+  }
+  case EMIS_ID_PRINT: {
+    result = EmissaryExternal::EmissaryPrint(data, ab, args);
+    break;
+  }
+  case EMIS_ID_MPI: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS) {
+      return (EmissaryReturn_t)0;
+    }
+    result = EmissaryExternal::EmissaryMPI(data, ab, args);
+    break;
+  }
+  case EMIS_ID_HDF5: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryExternal::EmissaryHDF5(data, ab, args);
+    break;
+  }
+  case EMIS_ID_FORTRT: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryExternal::EmissaryFortrt(data, ab, args);
+    break;
+    break;
+  }
+
+  case EMIS_ID_RESERVE: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryExternal::EmissaryReserve(data, ab, args);
+    break;
+  }
+  default:
+    fprintf(stderr,
+            "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+            ab->emisid, ab->emisfnid);
+  }
+  free(args);
+  return result;
+}
+
+// -----------------------------------------------------------------
+// -- Handle OFFLOAD_EMISSARY and OFFLOAD_EMISSARY_DM opcodes     --
+// -- handle_emissary_impl calls EmissaryTop for each active lane --
+// -----------------------------------------------------------------
+template <uint32_t NumLanes>
+LIBC_INLINE static ::rpc::Status
+handle_emissary_impl(::rpc::Server::Port &port) {
+
+  switch (port.get_opcode()) {
+
+  // This case handles the device function __llvm_emissary_rpc for emissary
+  // APIs that require no d2h or h2d memory transfer.
+  case OFFLOAD_EMISSARY: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+    uint32_t id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t ab;
+        emisExtractArgBuf((char *)buffer_ptr, &ab);
+        Results[id++] = EmissaryTop((char *)buffer_ptr, &ab, nullptr);
+      }
+    }
+    port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+    break;
+  }
+
+  // This case handles the device function __llvm_emissary_rpc_dm for emissary
+  // APIs require D2H or H2D transfer vectors to be processed through the port.
+  // FIXME: test with multiple transfer vectors of the same type.
+  case OFFLOAD_EMISSARY_DM: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+    uint32_t id = 0;
+    emisArgBuf_t AB[NumLanes];
+    std::unordered_map<void *, void *> D2HAddrList;
+    void *Xfers[NumLanes] = {nullptr};
+    void *devXfers[NumLanes] = {nullptr};
+    uint64_t XferSzs[NumLanes] = {0};
+    uint32_t numSendXfers = 0;
+    id = 0;
+
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          numSendXfers++;
+          devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+          XferSzs[id] = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
+          emisSkipXferArgSet(ab);
+        }
+        // Allocate the host space for the receive Xfers
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          size_t devSz = (((size_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+                          0x00000000FFFFFFFF);
+          void *hostAddr = new char[devSz];
+          D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+
+    // recv_n for device send_n into new host-allocated Xfers
+    if (numSendXfers)
+      port.recv_n(Xfers, XferSzs,
+                  [&](uint64_t Size) { return new char[Size]; });
+
+    // Xfers now contains just allocated host addrs for sends and
+    // devXfers contains corresponding devAddr for those sends
+    // Build map to pass to Emissary
+    id = 0;
+    for (void *Xfer : Xfers) {
+      if (Xfer) {
+        D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+        id++;
+      }
+    }
+
+    // Call EmissaryTop for each active lane
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+          emisSkipXferArgSet(ab);
+        Results[id] = EmissaryTop((char *)buffer_ptr, ab, &D2HAddrList);
+        id++;
+      }
+    }
+
+    // Process send_n for the H2D Xfers.
+    void *recvXfers[NumLanes] = {nullptr};
+    uint64_t recvXferSzs[NumLanes] = {0};
+    id = 0;
+    uint32_t numRecvXfers = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset ArgBuf tracker
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          numRecvXfers++;
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          recvXfers[id] = D2HAddrList[devAddr];
+          recvXferSzs[id] =
+              (((uint64_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+               0x00000000FFFFFFFF);
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+    if (numRecvXfers)
+      port.send_n(recvXfers, recvXferSzs);
+
+    // Cleanup all host allocated transfer buffers
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset the ArgBuf tracker ab
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        // Cleanup host allocated send Xfers
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        // Cleanup host allocated bufs
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+
+    port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+
+    break;
+  } // END CASE OFFLOAD_EMISSARY_DM
+
+  default: {
+    return ::rpc::RPC_UNHANDLED_OPCODE;
+    break;
+  }
+  }
+  return ::rpc::RPC_SUCCESS;
+} // end handle_emissary_impl
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+namespace LIBC_NAMESPACE_DECL {
+namespace rpc {
+LIBC_INLINE ::rpc::Status handleEmissaryOpcodes(::rpc::Server::Port &port,
+                                                uint32_t NumLanes) {
+  if (NumLanes == 1)
+    return internal::handle_emissary_impl<1>(port);
+  else if (NumLanes == 32)
+    return internal::handle_emissary_impl<32>(port);
+  else if (NumLanes == 64)
+    return internal::handle_emissary_impl<64>(port);
+  else
+    return ::rpc::RPC_ERROR;
+}
+
+} // namespace rpc
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
new file mode 100644
index 0000000000000..d9511244a0b8c
--- /dev/null
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -0,0 +1,392 @@
+//===------- CGEmitEmissaryExec.cpp - Codegen for _emissary_exec --==------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmitEmissaryExec:
+//
+// When a device call to the varadic function _emissary_exec is encountered
+// (in CGExpr.cpp) EmitEmissaryExec does these steps:
+//
+// 1. If string lens are runtime dependent, Emit code to determine runtime len.
+// 2. Emits call to allocate memory __llvm_emissary_premalloc,
+// 3. Emit stores of each arg into arg buffer,
+// 4. Emits call to function __llvm_emissary_rpc or __llvm_emissary_rpc_dm
+//
+// The arg buffer is a struct that contains the length, number of args, an
+// array of 4-byte keys that represent the type of of each arg, an array of
+// aligned "data" values for each arg, and finally the runtime string values.
+// If an arg is a string the data value is the runtime length of the string.
+// Each 4-byte key contains the llvm type ID and the number of bits for the
+// type. encoded by the macro _PACK_TY_BITLEN(x,y) ((uint32_t)x << 16) |
+// ((uint32_t)y)
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../clang/lib/Headers/EmissaryIds.h"
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+// These static helper functions support EmitEmissaryExec.
+static llvm::Function *GetOmpStrlenDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  // Args are pointer to char and maxstringlen
+  llvm::Type *ArgTypes[] = {CGM.Int8PtrTy, CGM.Int32Ty};
+  llvm::FunctionType *OmpStrlenFTy =
+      llvm::FunctionType::get(CGM.Int32Ty, ArgTypes, false);
+  if (auto *F = M.getFunction("__strlen_max")) {
+    assert(F->getFunctionType() == OmpStrlenFTy);
+    return F;
+  }
+  llvm::Function *FN = llvm::Function::Create(
+      OmpStrlenFTy, llvm::GlobalVariable::ExternalLinkage, "__strlen_max", &M);
+  return FN;
+}
+
+// Deterimines if an expression is a string with variable lenth
+static bool isVarString(const clang::Expr *argX, const clang::Type *argXTy,
+                        const llvm::Value *Arg) {
+  if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+      argXTy->getPointeeOrArrayElementType()->isCharType() && !argX->isLValue())
+    return true;
+  // Ensure the VarDecl has an inititalizer
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(argX))
+    if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+      if (!VD->getInit() ||
+          !llvm::isa<StringLiteral>(VD->getInit()->IgnoreImplicit()))
+        return true;
+  return false;
+}
+
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+  if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+      argXTy->getPointeeOrArrayElementType()->isCharType())
+    return true;
+  else
+    return false;
+}
+
+// Gets a string literal to write into the transfer buffer
+static const StringLiteral *getSL(const clang::Expr *argX,
+                                  const clang::Type *argXTy) {
+  // String in argX has known constant length
+  if (!argXTy->isConstantArrayType()) {
+    // Allow constant string to be a declared variable,
+    // But it must be constant and initialized.
+    const DeclRefExpr *DRE = cast<DeclRefExpr>(argX);
+    const VarDecl *VarD = cast<VarDecl>(DRE->getDecl());
+    argX = VarD->getInit()->IgnoreImplicit();
+  }
+  const StringLiteral *SL = cast<StringLiteral>(argX);
+  return SL;
+}
+
+// Returns a function pointer to __llvm_emissary_premalloc
+static llvm::Function *GetEmissaryAllocDeclaration(CodeGenModule &CGM) {
+  auto &M = CGM.getModule();
+  // clang::CodeGen::CodeGenTypes &CGT = CGM.getTypes();
+  const char *_executeName = "__llvm_emissary_premalloc";
+  llvm::Type *ArgTypes[] = {CGM.Int32Ty};
+  llvm::Function *FN;
+  // Maybe this should be pointer to char instead of pointer to void
+  llvm::FunctionType *VargsFnAllocFuncType = llvm::FunctionType::get(
+      CGM.getTypes().ConvertType(
+          CGM.getContext().getPointerType(CGM.getContext().VoidTy)),
+      ArgTypes, false);
+  if (!(FN = M.getFunction(_executeName)))
+    FN = llvm::Function::Create(VargsFnAllocFuncType,
+                                llvm::GlobalVariable::ExternalLinkage,
+                                _executeName, &M);
+  assert(FN->getFunctionType() == VargsFnAllocFuncType);
+  return FN;
+}
+
+// Returns a function pointer to __llvm_emissary_rpc
+static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
+                                                  bool hasXfers) {
+  const char *_executeName =
+      hasXfers ? "__llvm_emissary_rpc_dm" : "__llvm_emissary_rpc";
+  auto &M = CGM.getModule();
+  llvm::Type *ArgTypes[] = {
+      CGM.Int32Ty, CGM.getTypes().ConvertType(CGM.getContext().getPointerType(
+                       CGM.getContext().VoidTy))};
+  llvm::Function *FN;
+  llvm::FunctionType *VarfnFuncType =
+      llvm::FunctionType::get(CGM.Int64Ty, ArgTypes, false);
+  if (!(FN = M.getFunction(_executeName)))
+    FN = llvm::Function::Create(
+        VarfnFuncType, llvm::GlobalVariable::ExternalLinkage, _executeName, &M);
+  assert(FN->getFunctionType() == VarfnFuncType);
+  return FN;
+}
+
+// A macro to pack the llvm type ID and numbits into 4-byte key
+#define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
+
+//  ----- External function EmitEmissaryExec called from CGExpr.cpp -----
+RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
+  assert(getTarget().getTriple().isAMDGCN() ||
+         getTarget().getTriple().isNVPTX());
+  assert(E->getNumArgs() >= 1); // _emissary_exec always has at least one arg.
+  const llvm::DataLayout &DL = CGM.getDataLayout();
+  CallArgList Args;
+  // --- Insert 1st emisid arg if emiting fprintf or printf.
+  unsigned int AOE = 0;
+  if (E->getDirectCallee()->getNameAsString() == "fprintf") {
+    constexpr unsigned long long emisid =
+        ((unsigned long long)EMIS_ID_PRINT << 48) |
+        ((unsigned long long)_fprintf_idx << 32);
+    Args.add(
+        RValue::get(llvm::ConstantInt::get(Int64Ty, emisid)),
+        getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/false));
+    AOE = 1; // Arg# offset to E->arguments to use with E->getArg(I-AOE)
+  }
+  if (E->getDirectCallee()->getNameAsString() == "printf") {
+    constexpr unsigned long long emisid =
+        ((unsigned long long)EMIS_ID_PRINT << 48) |
+        ((unsigned long long)_printf_idx << 32);
+    Args.add(
+        RValue::get(llvm::ConstantInt::get(Int64Ty, emisid)),
+        getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/false));
+    AOE = 1; // Arg# offset to E->arguments to use with E->getArg(I-AOE)
+  }
+
+  EmitCallArgs(Args,
+               E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+               E->arguments(), E->getDirectCallee(),
+               /* ParamsToSkip = */ 0);
+
+  // We don't know how to emit non-scalar varargs.
+  if (std::any_of(Args.begin() + 1, Args.end(), [&](const CallArg &A) {
+        return !A.getRValue(*this).isScalar();
+      })) {
+    CGM.ErrorUnsupported(E, "non-scalar arg in GPU vargs function");
+    return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+  }
+  // NumArgs always includes emisid, but E->getNumArgs() could be 1 less if
+  // inserted it above.
+  unsigned NumArgs = (unsigned)Args.size();
+  llvm::SmallVector<llvm::Type *, 32> ArgTypes;
+  llvm::SmallVector<llvm::Value *, 32> VarStrLengths;
+  llvm::Value *TotalVarStrsLength = llvm::ConstantInt::get(Int32Ty, 0);
+  bool hasVarStrings = false;
+  ArgTypes.push_back(Int32Ty); // 1st field in struct is total DataLen
+  ArgTypes.push_back(Int32Ty); // 2nd field in struct will be num args
+  // An array of 4-byte keys that describe the arg type
+  for (unsigned I = 0; I < NumArgs; ++I)
+    ArgTypes.push_back(Int32Ty);
+
+  // Track the size of the numeric data length and string length
+  unsigned DataLen_CT =
+      (unsigned)(DL.getTypeAllocSize(Int32Ty)) * (NumArgs + 2);
+  unsigned AllStringsLen_CT = 0;
+
+  // ---  1st Pass over Args to create ArgTypes and count size ---
+  size_t structOffset = 4 * (NumArgs + 2);
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
+    llvm::Type *ArgType = Arg->getType();
+    // Skip string processing on arg0 which may not be in E->getArg(0)
+    if (I != 0) {
+      const Expr *argX = E->getArg(I - AOE)->IgnoreParenCasts();
+      auto *argXTy = argX->getType().getTypePtr();
+      if (isString(argXTy)) {
+        if (isVarString(argX, argXTy, Arg)) {
+          hasVarStrings = true;
+          if (auto *PtrTy = dyn_cast<llvm::PointerType>(ArgType))
+            if (PtrTy->getPointerAddressSpace()) {
+              Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
+              ArgType = Arg->getType();
+            }
+          llvm::Value *VarStrLen =
+              Builder.CreateCall(GetOmpStrlenDeclaration(CGM),
+                                 {Arg, llvm::ConstantInt::get(Int32Ty, 1024)});
+          VarStrLengths.push_back(VarStrLen);
+          TotalVarStrsLength = Builder.CreateAdd(TotalVarStrsLength, VarStrLen,
+                                                 "sum_of_var_strings_length");
+          ArgType = Int32Ty;
+        } else {
+          const StringLiteral *SL = getSL(argX, argXTy);
+          StringRef ArgString = SL->getString();
+          AllStringsLen_CT += ((int)ArgString.size() + 1);
+          // change ArgType from char ptr to int to contain string length
+          ArgType = Int32Ty;
+        }
+      } // end of processing string argument
+    } // End of skip 1st arg
+    // if ArgTypeSize is >4 bytes we need to insert dummy align
+    // values in the struct so all stores can be aligned .
+    // These dummy fields must be inserted before the arg.
+    //
+    // In the pass below where the stores are generated careful
+    // tracking of the index into the struct is necessary.
+    size_t needsPadding = (structOffset % (size_t)DL.getTypeAllocSize(ArgType));
+    if (needsPadding) {
+      DataLen_CT += (unsigned)needsPadding;
+      structOffset += needsPadding;
+      ArgTypes.push_back(Int32Ty); // could assert that needsPadding == 4 here
+    }
+
+    ArgTypes.push_back(ArgType);
+    DataLen_CT += ((int)DL.getTypeAllocSize(ArgType));
+    structOffset += (size_t)DL.getTypeAllocSize(ArgType);
+  }
+
+  // ---  Generate call to __llvm_emissary_premalloc to get data pointer
+  if (hasVarStrings)
+    TotalVarStrsLength = Builder.CreateAdd(
+        TotalVarStrsLength,
+        llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT),
+        "total_buffer_size");
+  llvm::Value *BufferLen =
+      hasVarStrings
+          ? TotalVarStrsLength
+          : llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT);
+  llvm::Value *DataStructPtr =
+      Builder.CreateCall(GetEmissaryAllocDeclaration(CGM), {BufferLen});
+
+  // --- Cast the generic return pointer to be a struct in device global memory
+  llvm::StructType *DataStructTy =
+      llvm::StructType::create(ArgTypes, "varfn_args_store");
+  unsigned AS = getContext().getTargetAddressSpace(LangAS::cuda_device);
+  llvm::Value *BufferPtr = Builder.CreatePointerCast(
+      DataStructPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+      "varfn_args_store_casted");
+  // ---  Header of struct contains length and NumArgs ---
+  llvm::Value *DataLenField = llvm::ConstantInt::get(Int32Ty, DataLen_CT);
+  llvm::Value *P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 0);
+  Builder.CreateAlignedStore(DataLenField, P,
+                             DL.getPrefTypeAlign(DataLenField->getType()));
+  llvm::Value *NumArgsField = llvm::ConstantInt::get(Int32Ty, NumArgs);
+  P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 1);
+  Builder.CreateAlignedStore(NumArgsField, P,
+                             DL.getPrefTypeAlign(NumArgsField->getType()));
+
+  // ---  2nd Pass: create array of 4-byte keys to describe each arg
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
+    llvm::Type::TypeID argtypeid =
+        Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+
+    // Get type size in bits. Usually 64 or 32.
+    uint32_t numbits = 0;
+    if (I > 0 &&
+        isString(
+            E->getArg(I - AOE)->IgnoreParenCasts()->getType().getTypePtr()))
+      // The llvm typeID for string is pointer.  Since pointer numbits is 0,
+      // we set numbits to 1 to distinguish pointer type ID as string pointer.
+      numbits = 1;
+    else
+      numbits = ty->getScalarSizeInBits();
+    // Create a key that combines llvm typeID and size
+    llvm::Value *Key =
+        llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+    P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
+    Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
+  }
+
+  // ---  3rd Pass: Store data values for each arg ---
+  unsigned varstring_index = 0;
+  unsigned structIndex = 2 + NumArgs;
+  structOffset = 4 * structIndex;
+  bool hasXfers;
+  for (unsigned I = 0; I < NumArgs; I++) {
+    llvm::Value *Arg = nullptr;
+    if (I == 0) {
+      Arg = Args[I].getKnownRValue().getScalarVal();
+      llvm::ConstantInt *CI = llvm::dyn_cast<llvm::ConstantInt>(Arg);
+      uint64_t uint64value = CI->getZExtValue();
+      uint32_t lower_32 = (uint32_t)(uint64value & 0xFFFFFFFF);
+      hasXfers = lower_32 ? true : false;
+    } else {
+      const Expr *argX = E->getArg(I - AOE)->IgnoreParenCasts();
+      auto *argXTy = argX->getType().getTypePtr();
+      if (isString(argXTy)) {
+        if (isVarString(argX, argXTy, Arg)) {
+          Arg = VarStrLengths[varstring_index];
+          varstring_index++;
+        } else {
+          const StringLiteral *SL = getSL(argX, argXTy);
+          StringRef ArgString = SL->getString();
+          int ArgStrLen = (int)ArgString.size() + 1;
+          // Change Arg from a char pointer to the integer string length
+          Arg = llvm::ConstantInt::get(Int32Ty, ArgStrLen);
+        }
+      } else {
+        Arg = Args[I].getKnownRValue().getScalarVal();
+      }
+    }
+    size_t structElementSize = (size_t)DL.getTypeAllocSize(Arg->getType());
+    size_t needsPadding = (structOffset % structElementSize);
+    if (needsPadding) {
+      // Skip over dummy fields in struct to align
+      structOffset += needsPadding; // should assert needsPadding == 4
+      structIndex++;
+    }
+    P = Builder.CreateStructGEP(DataStructTy, BufferPtr, structIndex);
+    Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
+    structOffset += structElementSize;
+    structIndex++;
+  }
+
+  // ---  4th Pass: memcpy all strings after the data values ---
+  // bitcast the struct in device global memory as a char buffer
+  Address BufferPtrByteAddr =
+      Address(Builder.CreatePointerCast(
+                  BufferPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+                  "_casted"),
+              Int8Ty, CharUnits::fromQuantity(1));
+
+  // BufferPtrByteAddr is a pointer to where we want to write the next string
+  BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+      BufferPtrByteAddr, CharUnits::fromQuantity(DataLen_CT));
+  varstring_index = 0;
+  // Skip string processing on arg0 which may not be in E->getArg(0)
+  for (unsigned I = 1; I < NumArgs; ++I) {
+    llvm::Value *Arg = Args[I].getKnownRValue().getScalarVal();
+    const Expr *argX = E->getArg(I - AOE)->IgnoreParenCasts();
+    auto *argXTy = argX->getType().getTypePtr();
+    if (isString(argXTy)) {
+      if (isVarString(argX, argXTy, Arg)) {
+        llvm::Value *varStrLength = VarStrLengths[varstring_index];
+        varstring_index++;
+        Address SrcAddr = Address(Arg, Int8Ty, CharUnits::fromQuantity(1));
+        Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, varStrLength);
+        // update BufferPtrByteAddr for next string memcpy
+        llvm::Value *PtrAsInt = BufferPtrByteAddr.emitRawPointer(*this);
+        BufferPtrByteAddr =
+            Address(Builder.CreateGEP(Int8Ty, PtrAsInt,
+                                      ArrayRef<llvm::Value *>(varStrLength)),
+                    Int8Ty, CharUnits::fromQuantity(1));
+      } else {
+        const StringLiteral *SL = getSL(argX, argXTy);
+        StringRef ArgString = SL->getString();
+        int ArgStrLen = (int)ArgString.size() + 1;
+        Address SrcAddr = CGM.GetAddrOfConstantStringFromLiteral(SL);
+        Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, ArgStrLen);
+        // update BufferPtrByteAddr for next memcpy
+        BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+            BufferPtrByteAddr, CharUnits::fromQuantity(ArgStrLen));
+      }
+    }
+  }
+  // --- Generate call to __llvm_emissary_rpc and return RValue
+  llvm::Value *emis_rc = Builder.CreateCall(
+      GetEmissaryExecDeclaration(CGM, hasXfers), {BufferLen, DataStructPtr});
+  // truncate long long int to int for printf return value.
+  if ((E->getDirectCallee()->getNameAsString() == "fprintf") ||
+      (E->getDirectCallee()->getNameAsString() == "printf"))
+    emis_rc = Builder.CreateTrunc(emis_rc, CGM.Int32Ty, "emis_rc");
+  return RValue::get(emis_rc);
+}
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 23802cdeb4811..4bfc5f4b15ee9 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -7033,6 +7033,16 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
       StaticOperator = true;
   }
 
+  // EmitEmissaryExec generates code to allocate an arg buffer, fill buffer
+  // with _emissary_exec args, then generate a call to either
+  // __llvm_emissary_rpc or __llvm_emissary_rpc_dm which are rpc utilities.
+  if ((CGM.getTriple().isAMDGCN() || CGM.getTriple().isNVPTX()) && FnType &&
+      dyn_cast<FunctionProtoType>(FnType) &&
+      dyn_cast<FunctionProtoType>(FnType)->isVariadic() &&
+      E->getDirectCallee() &&
+      (E->getDirectCallee()->getNameAsString() == "_emissary_exec"))
+    return EmitEmissaryExec(E);
+
   auto Arguments = E->arguments();
   if (StaticOperator) {
     // If we're calling a static operator, we need to emit the object argument
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index 434781b3c4f02..0481678be5c35 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangCodeGen
   CGAtomic.cpp
   CGBlocks.cpp
   CGBuiltin.cpp
+  CGEmitEmissaryExec.cpp
   CGCUDANV.cpp
   CGCUDARuntime.cpp
   CGCXX.cpp
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 85c058ba237ee..1f061dc63c5db 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4751,6 +4751,7 @@ class CodeGenFunction : public CodeGenTypeCache {
 
   RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
   RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
+  RValue EmitEmissaryExec(const CallExpr *E);
 
   RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                          const CallExpr *E, ReturnValueSlot ReturnValue);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c6c299bb61af3..1cc5302e27cba 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -42,6 +42,7 @@ set(core_files
   tgmath.h
   unwind.h
   varargs.h
+  EmissaryIds.h
   )
 
 set(arm_common_files
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
new file mode 100644
index 0000000000000..921bc5453b848
--- /dev/null
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -0,0 +1,121 @@
+//===- openmp/device/include/EmissaryIds.h enum & headers ----- C++ -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines Emissary API identifiers. This header is used by both host
+// and device compilations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_IDS_H
+#define OFFLOAD_EMISSARY_IDS_H
+
+#define __DEVATTR__
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__DEVATTR__)
+#undef __DEVATTR__
+#endif
+#define __DEVATTR__ __device__
+#endif
+#endif
+
+extern "C" __DEVATTR__ unsigned long long int
+_emissary_exec(const unsigned long long int, ...);
+
+#define _PACK_EMIS_IDS(a, b, c, d)                                             \
+  ((unsigned long long)a << 48) | ((unsigned long long)b << 32) |              \
+      ((unsigned long long)c << 16) | ((unsigned long long)d)
+
+/// These are the various Emissary APIs currently defined.
+/// MPI, HDF5, and, RESERVE are "external" Emissary APIs whose device stubs and
+/// host runtime support are provided by library maintainers typically in the
+/// form of a header such as "EmissaryMPI.h". The stubs call _emissary_exec.
+/// The host runtime support will call functions from the actual host library
+/// which are often platform specific and thus only linkable by an application.
+/// A small demo of an external Emissary API (EmissaryMPI.h) is found in docs.
+
+typedef enum {
+  EMIS_ID_INVALID,
+  EMIS_ID_FORTRT,
+  EMIS_ID_PRINT,
+  EMIS_ID_MPI,
+  EMIS_ID_HDF5,
+  EMIS_ID_RESERVE,
+} offload_emis_id_t;
+
+typedef enum {
+  _print_INVALID,
+  _printf_idx,
+  _fprintf_idx,
+  _ockl_asan_report_idx,
+} offload_emis_print_t;
+
+/// The future EMIS_ID_FORTRT will provide these device functions
+typedef enum {
+  _FortranAio_INVALID,
+  _FortranAioBeginExternalListOutput_idx,
+  _FortranAioOutputAscii_idx,
+  _FortranAioOutputInteger32_idx,
+  _FortranAioEndIoStatement_idx,
+  _FortranAioOutputInteger8_idx,
+  _FortranAioOutputInteger16_idx,
+  _FortranAioOutputInteger64_idx,
+  _FortranAioOutputReal32_idx,
+  _FortranAioOutputReal64_idx,
+  _FortranAioOutputComplex32_idx,
+  _FortranAioOutputComplex64_idx,
+  _FortranAioOutputLogical_idx,
+  _FortranAAbort_idx,
+  _FortranAStopStatementText_idx,
+  _FortranAioBeginExternalFormattedOutput_idx,
+  _FortranAStopStatement_idx,
+} offload_emis_fortrt_idx;
+
+/// This structure is created by emisExtractArgBuf to get information
+/// from the data buffer passed by rpc.
+typedef struct {
+  unsigned int DataLen;
+  unsigned int NumArgs;
+  unsigned int emisid;
+  unsigned int emisfnid;
+  unsigned int NumSendXfers;
+  unsigned int NumRecvXfers;
+  unsigned long long data_not_used;
+  char *keyptr;
+  char *argptr;
+  char *strptr;
+} emisArgBuf_t;
+
+typedef unsigned long long EmissaryReturn_t;
+typedef unsigned long long emis_argptr_t;
+typedef EmissaryReturn_t emisfn_t(void *, ...);
+
+#define MAXVARGS 32
+
+typedef enum service_rc {
+  _ERC_SUCCESS = 0,
+  _ERC_STATUS_ERROR = 1,
+  _ERC_DATA_USED_ERROR = 2,
+  _ERC_ADDINT_ERROR = 3,
+  _ERC_ADDFLOAT_ERROR = 4,
+  _ERC_ADDSTRING_ERROR = 5,
+  _ERC_UNSUPPORTED_ID_ERROR = 6,
+  _ERC_INVALID_ID_ERROR = 7,
+  _ERC_ERROR_INVALID_REQUEST = 8,
+  _ERC_EXCEED_MAXVARGS_ERROR = 9,
+} service_rc;
+
+#define LLVM_EMISSARY_BASE 'e'
+#define LLVM_EMISSARY_OPCODE(n) (LLVM_EMISSARY_BASE << 24 | n)
+
+typedef enum {
+  OFFLOAD_EMISSARY = LLVM_EMISSARY_OPCODE(1),
+  OFFLOAD_EMISSARY_DM = LLVM_EMISSARY_OPCODE(2),
+} offload_emissary_t;
+
+#endif // OFFLOAD_EMISSARY_IDS_H
diff --git a/libc/docs/gpu/emissary.rst b/libc/docs/gpu/emissary.rst
new file mode 100644
index 0000000000000..cdff3e59ba671
--- /dev/null
+++ b/libc/docs/gpu/emissary.rst
@@ -0,0 +1,274 @@
+.. _libc_gpu_emissary:
+
+=============
+Emissary APIs
+=============
+
+Emissary APIs
+-------------
+
+The libc GPU feature of LLVM offload provides the ability to execute 
+libc functions on the host initiated from GPU code including OpenMP
+target regions, CUDA kernels, or HIP kernels. 
+This libc-gpu capability uses the LLVM offload RPC mechanism. 
+An extension to libc-gpu is the abiity to execute arbitrary functions
+initiated by GPU source code. This extension is called ``Emissary APIs``. 
+
+Emissary APIs allow host API maintainers to easily maintain and distribute 
+a platform-specific Emissary API directly callable from the GPU. 
+The LLVM runtime does not link to the platform-specific host runtime until
+th application links to the host runtime. For standarized APIs, such as MPI,
+emissary APIs allows multiple platform-specific implementations of MPI.
+This results in increased application portability. But the primary 
+benefit of Emissary APIs is the ability to execute host functions without
+terminating the GPU kernel or code.
+
+An Emissary API implementation consists of a header file and a simple c++ 
+source file. The same implementation can be used for OpenMP, HIP, or Cuda.
+The c++ source file can be compiled as part of the host API library.
+
+This architecture allows host API maintainers to maintain the Emissary API
+externally.  That is, no ,change to the compiler or runtime is required.
+Maintainers can implement any subset of the host API in their emissary
+API implementation. Users attempting to use an unimplemented function
+from GPU code would get the same unresolved GPU reference they get
+without an Emissary API implementation. 
+
+Because they execute the actual external host functions, the server
+implementation cannot be directly linked to the LLVM runtime. The LLVM
+runtime in the emissary support provides a weak external reference
+to a single master function for the Emissary API. The external API
+maintainer provides this master function consisting of a case clause and
+wrapper function for each implemented function. There are a number of ways
+the API maintainer can package and distribute emissary support for a
+platform-specific API. Compiling the master function into the host library
+and distributing a device header file is typical.
+
+This external implementation architecture provides the ability to have 
+different platform-specific APIs for standard interface libraries
+such as ROCm MPI or CUDA MPI. 
+
+In this documentation, we provide an MPI example with a few MPI 
+functions as a demonstration. 
+
+EmisssaryMPI Example
+--------------------
+
+External Emissary APIs require an external library such as OpenMPI. 
+This example source shows the execution of MPI_Send and MPI_Recv
+from an OpenMP target region. 
+
+
+.. code-block:: c++
+
+  // 
+  //  EmissaryMPI_example.cpp
+  //
+  #include <EmissaryMPI.h>
+  #include <mpi.h>
+  #include <omp.h>
+  #include <stdio.h>
+  #define VSIZE 5000
+  int main(int argc, char *argv[]) {
+    int numranks, rank;
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &numranks);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm _mpi_comm = MPI_COMM_WORLD;
+    MPI_Datatype _mpi_int = MPI_INT;
+    int rc = 0;
+    int *send_recv_buffer = (int *)malloc(VSIZE * sizeof(int));
+    printf("R%d: Number of Ranks= %d ArraySize:%d\n", rank, numranks, VSIZE);
+    #pragma omp target teams distribute parallel for map(tofrom : rc)          \
+      map(to : send_recv_buffer[0 : VSIZE])
+    for (int i = 0; i < VSIZE; i++) {
+      if (rank == 0) {
+        send_recv_buffer[i] = -i;
+        MPI_Send(&send_recv_buffer[i], 1, _mpi_int, 1, i, _mpi_comm);
+      } else {
+        MPI_Recv(&send_recv_buffer[i], 1, _mpi_int, 0, i, _mpi_comm,
+                 MPI_STATUS_IGNORE);
+        if (send_recv_buffer[i] != -i)
+          rc = 1;  // FLAG AS ERROR IF NOT EXPECTED.
+      }
+    }
+    MPI_Finalize();
+    printf("R%d: === rc === %d\n", rank, rc);
+    return rc;
+  }
+
+It is worth noting that without Emissary API for MPI, the device link 
+step for the above code would fail with unresolved references to 
+MPI_Send and MPI_Recv. The include of EmissaryMPI.h resolves this. 
+
+.. code-block:: h
+
+  //===--------------------------------------------------------------------===//
+  // 
+  // EmissaryMPI.h : Example device header for EmissaryMPI
+  //
+  //===--------------------------------------------------------------------===//
+  #ifndef OFFLOAD_EMISSARY_MPI_H
+  #define OFFLOAD_EMISSARY_MPI_H
+  #include "EmissaryIds.h"
+  #include <mpi.h>
+  #include <stdarg.h>
+  typedef enum {
+    _MPI_INVALID,
+    _MPI_Send_idx,
+    _MPI_Recv_idx,
+  } offload_emis_mpi_t;
+  #if defined(__NVPTX__) || defined(__AMDGCN__)
+  //  EmissaryIds.h sets __DEVATTR__ to __device__ when compiling for either
+  //  CUDA or HIP. That attribute is not used for OpenMP device compilation.
+  __DEVATTR__
+  extern "C" int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
+                          int dest, int tag, MPI_Comm comm) {
+    return (int) _emissary_exec(
+        _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0),
+        buf, (int)count * 4, buf, count, datatype, dest, tag, comm);
+  }
+  __DEVATTR__
+  extern "C" int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
+                          int tag, MPI_Comm comm, MPI_Status *st) {
+    return (int) _emissary_exec(
+        _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1),
+       buf, (int)count * 4, buf, count, datatype, source, tag, comm, st);
+  }
+  #endif
+  #endif // end #ifndef OFFLOAD_EMISSARY_MPI_H
+
+The above is a minimal Emissary API implementation to provide MPI_Send and MPI_Recv
+functionality for demonstration.  The host service side to support this is the following:
+
+.. code-block:: cpp
+
+  //===--------------------------------------------------------------------===//
+  //
+  // EmissaryMPI.cpp : Defines the EmissaryMPI master function and variadic
+  //                   wrappers for each implemented function. In this example
+  //                   only MPI_Send and MPI_Recv are callable by GPU.  
+  //
+  //===--------------------------------------------------------------------===//
+  #include "EmissaryMPI.h"
+  #include <EmissaryIds.h>
+  #include <mpi.h>
+  static int V_MPI_Send(void *fnptr, ...) {
+    va_list args;
+    va_start(args, fnptr);
+    void *v0 = va_arg(args, void *);
+    int v1 = va_arg(args, int);
+    MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+    int v3 = va_arg(args, int);
+    int v4 = va_arg(args, int);
+    MPI_Comm v5 = va_arg(args, MPI_Comm);
+    va_end(args);
+    int rval = MPI_Send(v0, v1, v2, v3, v4, v5);
+    return rval;
+  }
+  static int V_MPI_Recv(void *fnptr, ...) {
+    va_list args;
+    va_start(args, fnptr);
+    void *v0 = va_arg(args, void *);
+    int v1 = va_arg(args, int);
+    MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+    int v3 = va_arg(args, int);
+    int v4 = va_arg(args, int);
+    MPI_Comm v5 = va_arg(args, MPI_Comm);
+    MPI_Status *v6 = va_arg(args, MPI_Status *);
+    va_end(args);
+    int rval = MPI_Recv(v0, v1, v2, v3, v4, v5, v6);
+    return rval;
+  }
+  namespace EmissaryExternal {
+  // The EmissaryMPI master function selector
+  extern "C" EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+                                          emis_argptr_t *a[]) {
+    switch (ab->emisfnid) {
+    case _MPI_Send_idx: {
+      void *fnptr = (void *)V_MPI_Send;
+      int return_value_int =
+          V_MPI_Send(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+      return (EmissaryReturn_t)return_value_int;
+    }
+    case _MPI_Recv_idx: {
+      void *fnptr = (void *)V_MPI_Recv;
+      int return_value_int =
+          V_MPI_Recv(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+      return (EmissaryReturn_t)return_value_int;
+    }
+    return (EmissaryReturn_t)0;
+    }
+  }
+
+The above OpenMP user source code can be compiled and executed with
+the following shell script:
+
+.. code-block:: sh
+
+  #/bin/bash
+  #
+  #  demo_mpi.sh
+  #
+  MPI=${MPI:-~/local/openmpi}
+  LLVM_INSTALL=${LLVM_INSTALL:-/work/grodgers/rocm/trunk}
+  OFFLOAD_ARCH=${OFFLOAD_ARCH:-gfx90a}
+  export PATH=$MPI/bin:$PATH
+  [ ! -d "$MPI" ] && echo "MPI:$MPI not found" && exit
+  [ ! -d "$LLVM_INSTALL" ] && echo "LLVM_INSTALL:$LLVM_INSTALL not found" && exit
+  echo "===1===>  Compiling Host Master Function EmissaryMPI found in EmissaryMPI.cpp"
+  $LLVM_INSTALL/bin/clang++ ../EmissaryMPI.cpp  -I.. -I$MPI/include -O3 -c -fPIC -o EmissaryMPI.o
+  echo "===2===>  Compiling and linking OpenMP application"
+  export OMPI_CC=$LLVM_INSTALL/bin/clang++
+  mpic++ -fopenmp --offload-arch=$OFFLOAD_ARCH EmissaryMPI_example.cpp -I.. -Xlinker EmissaryMPI.o
+  echo "===3===>  Executing 2 MPI ranks with ./a.out on GPU $OFFLOAD_ARCH"
+  mpirun -np 2 a.out 
+
+The shell compiles the host master function EmissaryMPI to resolve the weak 
+reference to EmissaryMPI(...) provided by the OpenMP runtime.
+which is then linked to the application a.out.  The platform-specific MPI
+might put EmissaryMPI in their library.
+
+A typical installation of OpenMPI would not accept device pointers.  However, a platform-specific
+implementation might have GPU-aware host library that does recognize device pointers.
+
+To make the typical installation work, EmissaryMPIs provide the definition of 
+device-to-host (send) and host-to-device(receive) transfer vectors. 
+
+The number of send transfer
+vectors and receive transfer vectors are embedded into the first argument to _emissary_exec
+with the macro _PACK_EMIS_IDS. The macro _PACK_EMIS_IDS has 4 16-bit fields: The enum ID 
+of the Emissary ID , the function index within that Emissary ID, the number of send 
+transfer vectors, and the number of receive transfer vectors. In the above example, 
+these first arguments are used in EmissaryMPI.h.
+      
+.. code-block::
+
+      PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0)
+      PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1)
+
+Each transfer vector adds two arguments to the call to _emissary_exec: the device pointer 
+argument, followed by the length in number of bytes to transfer. Obviously the use of
+transfer vectors slows runtime execution to allocate and move data.
+
+If the host platform-specific library was GPU-aware, no transfer vectors would be required.
+In this case the first arg in the header to _emissary_exec would be:
+
+.. code-block::
+
+      PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 0, 0)
+      PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 0)
+
+This would be significantly faster than the typical OpenMPI that require
+transfer vectors.  
+
+PACK_EMIS_IDS generates a compile time 64-bit constant to the first argument
+to _emissary_exec, followed by transfer vectors (if any) and then followed by
+the actual arguments to the host function. The device pass of the clang compiler
+emits code to pack the arguments into a buffer and generates a call to the
+proper RPC function.
+
+The use of Emissary API relieves the API maintainer from implementing different
+RPC functions to manage different sets of arguments. The later approach is what
+is done in the implementation of most libc functions. Emissary APIs are 
+useful when there are complex sets of arguments such as in IO APIs. 
diff --git a/libc/docs/gpu/index.rst b/libc/docs/gpu/index.rst
index 1fca67205acb4..9d2c3dc602031 100644
--- a/libc/docs/gpu/index.rst
+++ b/libc/docs/gpu/index.rst
@@ -18,3 +18,4 @@ learn more about this project.
    rpc
    testing
    motivation
+   emissary
diff --git a/libc/shared/rpc_server.h b/libc/shared/rpc_server.h
index 46e35f13f0eac..1149fa18e597d 100644
--- a/libc/shared/rpc_server.h
+++ b/libc/shared/rpc_server.h
@@ -10,12 +10,14 @@
 #define LLVM_LIBC_SHARED_RPC_SERVER_H
 
 #include "libc_common.h"
+#include "src/__support/RPC/emissary_rpc_server.h"
 #include "src/__support/RPC/rpc_server.h"
 
 namespace LIBC_NAMESPACE_DECL {
 namespace shared {
 
 using LIBC_NAMESPACE::rpc::handle_libc_opcodes;
+using LIBC_NAMESPACE::rpc::handleEmissaryOpcodes;
 
 } // namespace shared
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
index cac9c4e05e369..876bd221dab03 100644
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -6,6 +6,7 @@ add_object_library(
   rpc_client
   SRCS
     rpc_client.cpp
+    emissary_device_utils.cpp
   HDRS
     rpc_client.h
   DEPENDS
diff --git a/libc/src/__support/RPC/emissary_device_utils.cpp b/libc/src/__support/RPC/emissary_device_utils.cpp
new file mode 100644
index 0000000000000..0900b5f2b56bd
--- /dev/null
+++ b/libc/src/__support/RPC/emissary_device_utils.cpp
@@ -0,0 +1,115 @@
+//===- emisssary_device_utils.cpp - utils for Emissary APIs ------- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device functions emitted by clang/lib/CodeGen/EmitEmissaryExec.cpp
+//
+//===----------------------------------------------------------------------===//
+
+// #include "Allocator.h"
+// #include "Configuration.h"
+// #include "DeviceTypes.h"
+#include "EmissaryIds.h"
+#include "rpc_client.h"
+#include "shared/rpc.h"
+#include "src/__support/macros/config.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+extern "C" {
+
+#ifdef __NVPTX__
+[[gnu::leaf]] void *malloc(size_t Size);
+[[gnu::leaf]] void free(void *Ptr);
+#endif
+
+/// static auto null_string = "(null)";
+
+// namespace LIBC_NAMESPACE_DECL{
+// namespace rpc {
+
+// The clang compiler will generate calls to __strlen_max when string length
+// is not compile time constant.
+uint32_t __strlen_max(const char *instr, uint32_t maxstrlen) {
+  if (instr == 0) // encountered a null pointer to string
+    return 0;
+  for (uint32_t i = 0; i < maxstrlen; i++)
+    if (instr[i] == (char)0)
+      return (uint32_t)(i + 1);
+  return maxstrlen;
+}
+
+void *__llvm_emissary_premalloc(uint32_t sz) {
+#ifdef __NVPTX__
+  return malloc((size_t)sz);
+#else
+  return LIBC_NAMESPACE::malloc((size_t)sz);
+#endif
+}
+unsigned long long __llvm_emissary_rpc(uint32_t sz32, void *bufdata) {
+  rpc::Client::Port Port = LIBC_NAMESPACE::rpc::client.open<OFFLOAD_EMISSARY>();
+  Port.send_n(bufdata, (size_t)sz32);
+  unsigned long long Ret;
+  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+    Ret = static_cast<unsigned long long>(Buffer->data[0]);
+  });
+#ifdef __NVPTX__
+  free(bufdata);
+#else
+  LIBC_NAMESPACE::free(bufdata);
+#endif
+  return Ret;
+}
+
+// This is for emissary APIs that require d2h or h2d memory transfers.
+unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
+  rpc::Client::Port Port =
+      LIBC_NAMESPACE::rpc::client.open<OFFLOAD_EMISSARY_DM>();
+  Port.send_n(bufdata, (size_t)sz32);
+  char *data = (char *)bufdata;
+  uint32_t *int32_data = (uint32_t *)data;
+  uint32_t NumArgs = int32_data[1];
+  char *keyptr = data + (2 * sizeof(int));
+  char *argptr = keyptr + (NumArgs * sizeof(int));
+  if (((size_t)argptr) % (size_t)8)
+    argptr += 4; // argptr must be aligned
+  uint64_t arg1 = *(uint64_t *)argptr;
+  uint32_t NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  uint32_t NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+  // Skip by arg1 and process Send and Recv Xfers if any
+  argptr += sizeof(uint64_t);
+  for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
+    void *D2Hdata = (void *)*((uint64_t *)argptr);
+    argptr += sizeof(void *);
+    size_t D2Hsize = ((size_t)*((size_t *)argptr) & 0x00000000FFFFFFFF);
+    argptr += sizeof(size_t);
+    Port.send_n(D2Hdata, D2Hsize);
+  }
+  for (uint32_t idx = 0; idx < NumRecvXfers; idx++) {
+    void *H2Ddata = (void *)*((uint64_t *)argptr);
+    argptr += sizeof(void *);
+    argptr += sizeof(size_t);
+    uint64_t recv_size;
+    void *buf = nullptr;
+    Port.recv_n(&buf, &recv_size,
+                [&](uint64_t) { return reinterpret_cast<void *>(H2Ddata); });
+  }
+  unsigned long long Ret;
+  Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+    Ret = static_cast<unsigned long long>(Buffer->data[0]);
+  });
+#ifdef __NVPTX__
+  free(bufdata);
+#else
+  LIBC_NAMESPACE::free(bufdata);
+#endif
+  return Ret;
+}
+
+//} // end namespace rpc
+//} // end  namespace LIBC_NAMESPACE_DECL
+} // end extern "C"
diff --git a/libc/src/__support/RPC/emissary_rpc_server.h b/libc/src/__support/RPC/emissary_rpc_server.h
new file mode 100644
index 0000000000000..bf6757d9c3b64
--- /dev/null
+++ b/libc/src/__support/RPC/emissary_rpc_server.h
@@ -0,0 +1,515 @@
+//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is intended to be used externally as part of the `shared/`
+// interface. Consider this an extenion of rpc_server.h to support emissary
+// APIs. rpc_server.h must be included first.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+#define LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+
+#include "../clang/lib/Headers/EmissaryIds.h"
+#include "rpc_server.h"
+#include <string.h>
+#include <unordered_map>
+
+namespace EmissaryExternal {
+extern "C" {
+/// Called by EmissaryTop for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+                                                 emis_argptr_t *arg[]);
+/// Called by EmissaryTop for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+                                                  emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+                                                     emisArgBuf_t *ab,
+                                                     emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support Fortran IO runtime
+__attribute((weak)) EmissaryReturn_t EmissaryFortrt(char *data,
+                                                    emisArgBuf_t *ab,
+                                                    emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support printf/fprintf/asan report externally
+__attribute((weak)) EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab,
+                                                   emis_argptr_t *arg[]);
+} // end extern "C"
+} // namespace EmissaryExternal
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+#if 1
+enum TypeID {
+  // PrimitiveTypes
+  HalfTyID = 0,  ///< 16-bit floating point type
+  BFloatTyID,    ///< 16-bit floating point type (7-bit significand)
+  FloatTyID,     ///< 32-bit floating point type
+  DoubleTyID,    ///< 64-bit floating point type
+  X86_FP80TyID,  ///< 80-bit floating point type (X87)
+  FP128TyID,     ///< 128-bit floating point type (112-bit significand)
+  PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+  VoidTyID,      ///< type with no size
+  LabelTyID,     ///< Labels
+  MetadataTyID,  ///< Metadata
+  X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
+  TokenTyID,     ///< Tokens
+
+  // Derived types... see DerivedTypes.h file.
+  IntegerTyID,        ///< Arbitrary bit width integers
+  ByteTyID,           ///< Arbitrary bit width bytes
+  FunctionTyID,       ///< Functions
+  PointerTyID,        ///< Pointers
+  StructTyID,         ///< Structures
+  ArrayTyID,          ///< Arrays
+  FixedVectorTyID,    ///< Fixed width SIMD vector type
+  ScalableVectorTyID, ///< Scalable SIMD vector type
+  TypedPointerTyID,   ///< Typed pointer used by some GPU targets
+  TargetExtTyID,      ///< Target extension type
+};
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// emisExtractArgBuf extract ArgBuf using protocol EmitEmissaryExec makes.
+static void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+  uint32_t *int32_data = (uint32_t *)data;
+  ab->DataLen = int32_data[0];
+  ab->NumArgs = int32_data[1];
+
+  // Note: while the data buffer contains all args including strings,
+  // ab->DataLen does not include strings. It only counts header, keys,
+  // and aligned numerics.
+
+  ab->keyptr = data + (2 * sizeof(int));
+  ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+  ab->strptr = data + (size_t)ab->DataLen;
+  int alignfill = 0;
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    alignfill = 4;
+  }
+
+  // Extract the two emissary identifiers and number of send
+  // and recv device data transfers. These are 4 16 bit values
+  // packed into a single 64-bit field.
+  uint64_t arg1 = *(uint64_t *)ab->argptr;
+  ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+  ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+  ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+  ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+  // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+  ab->keyptr += sizeof(int);
+  ab->argptr += sizeof(uint64_t);
+  ab->NumArgs -= 1;
+
+  // data_not_used used for testing consistency.
+  ab->data_not_used =
+      (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+                               sizeof(uint64_t) + alignfill);
+
+  // Ensure first arg after emissary id arg is aligned.
+  if (((size_t)ab->argptr) % (size_t)8) {
+    ab->argptr += 4;
+    ab->data_not_used -= 4;
+  }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+static uint64_t getuint32(char *val) {
+  uint32_t i32 = *(uint32_t *)val;
+  return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+static uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+// build argument array to create call to variadic wrappers
+static uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+                   unsigned long long *data_not_used, emis_argptr_t *a[],
+                   std::unordered_map<void *, void *> *D2HAddrList) {
+  size_t num_bytes;
+  size_t bytes_consumed;
+  size_t strsz;
+  size_t fillerNeeded;
+
+  uint argcount = 0;
+
+  for (int argnum = 0; argnum < NumArgs; argnum++) {
+    num_bytes = 0;
+    strsz = 0;
+    unsigned int key = *(unsigned int *)keyptr;
+    unsigned int llvmID = key >> 16;
+    unsigned int numbits = (key << 16) >> 16;
+
+    switch (llvmID) {
+    case FloatTyID:  ///<  2: 32-bit floating point type
+    case DoubleTyID: ///<  3: 64-bit floating point type
+    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      break;
+
+    case IntegerTyID: ///< 11: Arbitrary bit width integers
+      num_bytes = numbits / 8;
+      bytes_consumed = num_bytes;
+      fillerNeeded = ((size_t)dataptr) % num_bytes;
+      if (fillerNeeded) {
+        dataptr += fillerNeeded;
+        bytes_consumed += fillerNeeded;
+      }
+      if ((*data_not_used) < bytes_consumed)
+        return _ERC_DATA_USED_ERROR;
+
+      if (num_bytes == 4)
+        a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+      else
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      break;
+
+    case PointerTyID: {   ///< 15: Pointers
+      if (numbits == 1) { // This is a pointer to string
+        num_bytes = 4;
+        bytes_consumed = num_bytes;
+        strsz = (size_t)*(unsigned int *)dataptr;
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)((char *)strptr);
+      } else {
+        num_bytes = 8;
+        bytes_consumed = num_bytes;
+        fillerNeeded = ((size_t)dataptr) % num_bytes;
+        if (fillerNeeded) {
+          dataptr += fillerNeeded; // dataptr is now aligned
+          bytes_consumed += fillerNeeded;
+        }
+        if ((*data_not_used) < bytes_consumed)
+          return _ERC_DATA_USED_ERROR;
+        a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+      }
+      if (D2HAddrList) {
+        auto found = D2HAddrList->find((void *)a[argcount]);
+        if (found != D2HAddrList->end())
+          a[argcount] = (emis_argptr_t *)found->second;
+      }
+    } break;
+
+    case HalfTyID:           ///<  1: 16-bit floating point type
+    case ArrayTyID:          ///< 14: Arrays
+    case StructTyID:         ///< 13: Structures
+    case FunctionTyID:       ///< 12: Functions
+    case TokenTyID:          ///< 10: Tokens
+    case MetadataTyID:       ///<  8: Metadata
+    case LabelTyID:          ///<  7: Labels
+    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
+                             ///<  PowerPC)
+    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
+    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
+    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
+    case TargetExtTyID:      ///< Target extension type
+    case VoidTyID:
+      return _ERC_UNSUPPORTED_ID_ERROR;
+      break;
+    default:
+      return _ERC_INVALID_ID_ERROR;
+    }
+    // Move to next argument
+    dataptr += num_bytes;
+    strptr += strsz;
+    *data_not_used -= bytes_consumed;
+    keyptr += 4;
+    argcount++;
+  }
+  return _ERC_SUCCESS;
+}
+
+//  Utility to skip two args in the ArgBuf
+static void emisSkipXferArgSet(emisArgBuf_t *ab) {
+  // Skip the ptr and size of the Xfer
+  ab->NumArgs -= 2;
+  ab->keyptr += 2 * sizeof(uint32_t);
+  ab->argptr += 2 * sizeof(void *);
+  ab->data_not_used -= 2 * sizeof(void *);
+}
+
+static EmissaryReturn_t
+EmissaryTop(char *data, emisArgBuf_t *ab,
+            std::unordered_map<void *, void *> *D2HAddrList) {
+  EmissaryReturn_t result = 0;
+  emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+      sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+  switch (ab->emisid) {
+  case EMIS_ID_INVALID: {
+    fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+    result = 0;
+    break;
+  }
+  case EMIS_ID_PRINT: {
+    result = EmissaryExternal::EmissaryPrint(data, ab, args);
+    break;
+  }
+  case EMIS_ID_MPI: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS) {
+      return (EmissaryReturn_t)0;
+    }
+    result = EmissaryExternal::EmissaryMPI(data, ab, args);
+    break;
+  }
+  case EMIS_ID_HDF5: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryExternal::EmissaryHDF5(data, ab, args);
+    break;
+  }
+  case EMIS_ID_FORTRT: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryExternal::EmissaryFortrt(data, ab, args);
+    break;
+    break;
+  }
+
+  case EMIS_ID_RESERVE: {
+    if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+                           &(ab->data_not_used), &args[0],
+                           D2HAddrList) != _ERC_SUCCESS)
+      return (EmissaryReturn_t)0;
+    result = EmissaryExternal::EmissaryReserve(data, ab, args);
+    break;
+  }
+  default:
+    fprintf(stderr,
+            "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+            ab->emisid, ab->emisfnid);
+  }
+  free(args);
+  return result;
+}
+
+// -----------------------------------------------------------------
+// -- Handle OFFLOAD_EMISSARY and OFFLOAD_EMISSARY_DM opcodes     --
+// -- handle_emissary_impl calls EmissaryTop for each active lane --
+// -----------------------------------------------------------------
+template <uint32_t NumLanes>
+LIBC_INLINE static ::rpc::Status
+handle_emissary_impl(::rpc::Server::Port &port) {
+
+  switch (port.get_opcode()) {
+
+  // This case handles the device function __llvm_emissary_rpc for emissary
+  // APIs that require no d2h or h2d memory transfer.
+  case OFFLOAD_EMISSARY: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+    uint32_t id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t ab;
+        emisExtractArgBuf((char *)buffer_ptr, &ab);
+        Results[id++] = EmissaryTop((char *)buffer_ptr, &ab, nullptr);
+      }
+    }
+    port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+    break;
+  }
+
+  // This case handles the device function __llvm_emissary_rpc_dm for emissary
+  // APIs require D2H or H2D transfer vectors to be processed through the port.
+  // FIXME: test with multiple transfer vectors of the same type.
+  case OFFLOAD_EMISSARY_DM: {
+    uint64_t Sizes[NumLanes] = {0};
+    unsigned long long Results[NumLanes] = {0};
+    void *buf_ptrs[NumLanes] = {nullptr};
+    port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+    uint32_t id = 0;
+    emisArgBuf_t AB[NumLanes];
+    std::unordered_map<void *, void *> D2HAddrList;
+    void *Xfers[NumLanes] = {nullptr};
+    void *devXfers[NumLanes] = {nullptr};
+    uint64_t XferSzs[NumLanes] = {0};
+    uint32_t numSendXfers = 0;
+    id = 0;
+
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          numSendXfers++;
+          devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+          XferSzs[id] = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
+          emisSkipXferArgSet(ab);
+        }
+        // Allocate the host space for the receive Xfers
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          size_t devSz = (((size_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+                          0x00000000FFFFFFFF);
+          void *hostAddr = new char[devSz];
+          D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+
+    // recv_n for device send_n into new host-allocated Xfers
+    if (numSendXfers)
+      port.recv_n(Xfers, XferSzs,
+                  [&](uint64_t Size) { return new char[Size]; });
+
+    // Xfers now contains just allocated host addrs for sends and
+    // devXfers contains corresponding devAddr for those sends
+    // Build map to pass to Emissary
+    id = 0;
+    for (void *Xfer : Xfers) {
+      if (Xfer) {
+        D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+        id++;
+      }
+    }
+
+    // Call EmissaryTop for each active lane
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+          emisSkipXferArgSet(ab);
+        Results[id] = EmissaryTop((char *)buffer_ptr, ab, &D2HAddrList);
+        id++;
+      }
+    }
+
+    // Process send_n for the H2D Xfers.
+    void *recvXfers[NumLanes] = {nullptr};
+    uint64_t recvXferSzs[NumLanes] = {0};
+    id = 0;
+    uint32_t numRecvXfers = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset ArgBuf tracker
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+          emisSkipXferArgSet(ab);
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          numRecvXfers++;
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          recvXfers[id] = D2HAddrList[devAddr];
+          recvXferSzs[id] =
+              (((uint64_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+               0x00000000FFFFFFFF);
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+    if (numRecvXfers)
+      port.send_n(recvXfers, recvXferSzs);
+
+    // Cleanup all host allocated transfer buffers
+    id = 0;
+    for (void *buffer_ptr : buf_ptrs) {
+      if (buffer_ptr) {
+        emisArgBuf_t *ab = &AB[id];
+        // Reset the ArgBuf tracker ab
+        emisExtractArgBuf((char *)buffer_ptr, ab);
+        // Cleanup host allocated send Xfers
+        for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        // Cleanup host allocated bufs
+        for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+          void *devAddr = (void *)*((uint64_t *)ab->argptr);
+          void *hostAddr = D2HAddrList[devAddr];
+          delete[] reinterpret_cast<char *>(hostAddr);
+          emisSkipXferArgSet(ab);
+        }
+        id++;
+      }
+    }
+
+    port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+      Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+      delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+    });
+
+    break;
+  } // END CASE OFFLOAD_EMISSARY_DM
+
+  default: {
+    return ::rpc::RPC_UNHANDLED_OPCODE;
+    break;
+  }
+  }
+  return ::rpc::RPC_SUCCESS;
+} // end handle_emissary_impl
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+namespace LIBC_NAMESPACE_DECL {
+namespace rpc {
+LIBC_INLINE ::rpc::Status handleEmissaryOpcodes(::rpc::Server::Port &port,
+                                                uint32_t NumLanes) {
+  if (NumLanes == 1)
+    return internal::handle_emissary_impl<1>(port);
+  else if (NumLanes == 32)
+    return internal::handle_emissary_impl<32>(port);
+  else if (NumLanes == 64)
+    return internal::handle_emissary_impl<64>(port);
+  else
+    return ::rpc::RPC_ERROR;
+}
+
+} // namespace rpc
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 83cde630ebf40..4a8a0647e727f 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -115,6 +115,9 @@ runServer(plugin::GenericDeviceTy &Device, void *Buffer,
   if (Status == rpc::RPC_UNHANDLED_OPCODE)
     Status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*Port, NumLanes);
 
+  if (Status == rpc::RPC_UNHANDLED_OPCODE)
+    Status = LIBC_NAMESPACE::shared::handleEmissaryOpcodes(*Port, NumLanes);
+
 #ifdef OFFLOAD_HAS_FLANG_RT
   if (Status == rpc::RPC_UNHANDLED_OPCODE)
     Status = static_cast<rpc::Status>(

>From 205aed3c859e12e6e8130fc69d357e5cbce9bfe8 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Fri, 20 Mar 2026 09:43:37 -0500
Subject: [PATCH 2/2] [OpenMP] remove need to copy llvm::type::TypeID to the
 Emissary runtime

---
 clang/lib/CodeGen/CGEmitEmissaryExec.cpp     | 43 ++++++++++++-
 clang/lib/Headers/EmissaryIds.h              |  7 ++
 libc/src/__support/RPC/emissary_rpc_server.h | 67 ++------------------
 3 files changed, 54 insertions(+), 63 deletions(-)

diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
index d9511244a0b8c..48585a8f3b496 100644
--- a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -134,6 +134,46 @@ static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
 // A macro to pack the llvm type ID and numbits into 4-byte key
 #define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
 
+static EmisTyID getEmisTyID(llvm::Type::TypeID tyid) {
+  switch (tyid) {
+  case llvm::Type::HalfTyID:     ///< 16-bit floating point type
+  case llvm::Type::X86_FP80TyID: ///< 80-bit floating point type (X87)
+  case llvm::Type::BFloatTyID:   ///< 16-bit floating point type (7-bit
+                                 ///< significand)
+    return EmisInvalidTy;
+  case llvm::Type::FloatTyID:  ///< 32-bit floating point type
+  case llvm::Type::DoubleTyID: ///< 64-bit floating point type
+  case llvm::Type::FP128TyID:  ///< 128-bit floating point type (112-bit
+                               ///< significand)
+    return EmisFloatTy;
+  case llvm::Type::PPC_FP128TyID: ///< 128-bit floating point type (two 64-bits,
+                                  ///< PowerPC)
+  case llvm::Type::VoidTyID:      ///< type with no size
+  case llvm::Type::LabelTyID:     ///< Labels
+  case llvm::Type::MetadataTyID:  ///< Metadata
+  case llvm::Type::X86_AMXTyID:   ///< AMX vectors (8192 bits, X86 specific)
+  case llvm::Type::TokenTyID:     ///< Tokens
+    return EmisInvalidTy;
+  // Derived types... see DerivedTypes.h file.
+  case llvm::Type::IntegerTyID: ///< Arbitrary bit width integers
+    return EmisIntegerTy;
+  case llvm::Type::ByteTyID:     ///< Arbitrary bit width bytes
+  case llvm::Type::FunctionTyID: ///< Functions
+    return EmisInvalidTy;
+  case llvm::Type::PointerTyID: ///< Pointers
+    return EmisPointerTy;
+  case llvm::Type::StructTyID:         ///< Structures
+  case llvm::Type::ArrayTyID:          ///< Arrays
+  case llvm::Type::FixedVectorTyID:    ///< Fixed width SIMD vector type
+  case llvm::Type::ScalableVectorTyID: ///< Scalable SIMD vector type
+  case llvm::Type::TypedPointerTyID: ///< Typed pointer used by some GPU targets
+  case llvm::Type::TargetExtTyID:    ///< Target extension type
+    return EmisInvalidTy;
+  default:
+    return EmisInvalidTy;
+  }
+}
+
 //  ----- External function EmitEmissaryExec called from CGExpr.cpp -----
 RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
   assert(getTarget().getTriple().isAMDGCN() ||
@@ -278,6 +318,7 @@ RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
     llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
     llvm::Type::TypeID argtypeid =
         Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+    EmisTyID emis_tyid = getEmisTyID(argtypeid);
 
     // Get type size in bits. Usually 64 or 32.
     uint32_t numbits = 0;
@@ -291,7 +332,7 @@ RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
       numbits = ty->getScalarSizeInBits();
     // Create a key that combines llvm typeID and size
     llvm::Value *Key =
-        llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+        llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(emis_tyid, numbits));
     P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
     Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
   }
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
index 921bc5453b848..8056ed0e7ee83 100644
--- a/clang/lib/Headers/EmissaryIds.h
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -31,6 +31,13 @@ _emissary_exec(const unsigned long long int, ...);
   ((unsigned long long)a << 48) | ((unsigned long long)b << 32) |              \
       ((unsigned long long)c << 16) | ((unsigned long long)d)
 
+enum EmisTyID {
+  EmisInvalidTy = 0,
+  EmisFloatTy,
+  EmisIntegerTy,
+  EmisPointerTy,
+};
+
 /// These are the various Emissary APIs currently defined.
 /// MPI, HDF5, and, RESERVE are "external" Emissary APIs whose device stubs and
 /// host runtime support are provided by library maintainers typically in the
diff --git a/libc/src/__support/RPC/emissary_rpc_server.h b/libc/src/__support/RPC/emissary_rpc_server.h
index bf6757d9c3b64..6b52984634e03 100644
--- a/libc/src/__support/RPC/emissary_rpc_server.h
+++ b/libc/src/__support/RPC/emissary_rpc_server.h
@@ -42,42 +42,6 @@ __attribute((weak)) EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab,
 } // end extern "C"
 } // namespace EmissaryExternal
 
-// We would like to get llvm typeID enum from Type.h. e.g.
-// #include ".../llvm/include/llvm/IR/Type.h"
-// But we cannot include LLVM headers in a runtime function.
-// So we a have a manual copy of llvm TypeID enum from Type.h
-// The codegen for _emissary_exec puts this ID in the key for
-// each arg and the host runtime needs to decode this key.
-#if 1
-enum TypeID {
-  // PrimitiveTypes
-  HalfTyID = 0,  ///< 16-bit floating point type
-  BFloatTyID,    ///< 16-bit floating point type (7-bit significand)
-  FloatTyID,     ///< 32-bit floating point type
-  DoubleTyID,    ///< 64-bit floating point type
-  X86_FP80TyID,  ///< 80-bit floating point type (X87)
-  FP128TyID,     ///< 128-bit floating point type (112-bit significand)
-  PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
-  VoidTyID,      ///< type with no size
-  LabelTyID,     ///< Labels
-  MetadataTyID,  ///< Metadata
-  X86_AMXTyID,   ///< AMX vectors (8192 bits, X86 specific)
-  TokenTyID,     ///< Tokens
-
-  // Derived types... see DerivedTypes.h file.
-  IntegerTyID,        ///< Arbitrary bit width integers
-  ByteTyID,           ///< Arbitrary bit width bytes
-  FunctionTyID,       ///< Functions
-  PointerTyID,        ///< Pointers
-  StructTyID,         ///< Structures
-  ArrayTyID,          ///< Arrays
-  FixedVectorTyID,    ///< Fixed width SIMD vector type
-  ScalableVectorTyID, ///< Scalable SIMD vector type
-  TypedPointerTyID,   ///< Typed pointer used by some GPU targets
-  TargetExtTyID,      ///< Target extension type
-};
-#endif
-
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 
@@ -145,20 +109,16 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
   size_t bytes_consumed;
   size_t strsz;
   size_t fillerNeeded;
-
   uint argcount = 0;
-
   for (int argnum = 0; argnum < NumArgs; argnum++) {
     num_bytes = 0;
     strsz = 0;
     unsigned int key = *(unsigned int *)keyptr;
-    unsigned int llvmID = key >> 16;
+    unsigned int emis_id = key >> 16;
     unsigned int numbits = (key << 16) >> 16;
 
-    switch (llvmID) {
-    case FloatTyID:  ///<  2: 32-bit floating point type
-    case DoubleTyID: ///<  3: 64-bit floating point type
-    case FP128TyID:  ///<  5: 128-bit floating point type (112-bit mantissa)
+    switch (emis_id) {
+    case EmisFloatTy:
       num_bytes = numbits / 8;
       bytes_consumed = num_bytes;
       fillerNeeded = ((size_t)dataptr) % num_bytes;
@@ -175,7 +135,7 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
         a[argcount] = (emis_argptr_t *)getuint64(dataptr);
       break;
 
-    case IntegerTyID: ///< 11: Arbitrary bit width integers
+    case EmisIntegerTy:
       num_bytes = numbits / 8;
       bytes_consumed = num_bytes;
       fillerNeeded = ((size_t)dataptr) % num_bytes;
@@ -192,7 +152,7 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
         a[argcount] = (emis_argptr_t *)getuint64(dataptr);
       break;
 
-    case PointerTyID: {   ///< 15: Pointers
+    case EmisPointerTy: {
       if (numbits == 1) { // This is a pointer to string
         num_bytes = 4;
         bytes_consumed = num_bytes;
@@ -219,23 +179,6 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
       }
     } break;
 
-    case HalfTyID:           ///<  1: 16-bit floating point type
-    case ArrayTyID:          ///< 14: Arrays
-    case StructTyID:         ///< 13: Structures
-    case FunctionTyID:       ///< 12: Functions
-    case TokenTyID:          ///< 10: Tokens
-    case MetadataTyID:       ///<  8: Metadata
-    case LabelTyID:          ///<  7: Labels
-    case PPC_FP128TyID:      ///<  6: 128-bit floating point type (two 64-bits,
-                             ///<  PowerPC)
-    case X86_FP80TyID:       ///<  4: 80-bit floating point type (X87)
-    case FixedVectorTyID:    ///< 16: Fixed width SIMD vector type
-    case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
-    case TypedPointerTyID:   ///< Typed pointer used by some GPU targets
-    case TargetExtTyID:      ///< Target extension type
-    case VoidTyID:
-      return _ERC_UNSUPPORTED_ID_ERROR;
-      break;
     default:
       return _ERC_INVALID_ID_ERROR;
     }



More information about the cfe-commits mailing list