[clang] [libc] [llvm] [OpenMP] [cuda] [hip] [libc] Support for Emissary APIs (PR #187602)
Greg Rodgers via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 20 07:44:27 PDT 2026
https://github.com/gregrodgers updated https://github.com/llvm/llvm-project/pull/187602
>From c6cb50815a6911bc147ca9348f3de3e164ef4594 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Thu, 19 Mar 2026 17:18:50 -0500
Subject: [PATCH 1/2] [OpenMP] [cuda] [hip] [libc] Support for Emissary APIs
without EmissaryPrint
---
78 | 554 ++++++++++++++++++
clang/lib/CodeGen/CGEmitEmissaryExec.cpp | 392 +++++++++++++
clang/lib/CodeGen/CGExpr.cpp | 10 +
clang/lib/CodeGen/CMakeLists.txt | 1 +
clang/lib/CodeGen/CodeGenFunction.h | 1 +
clang/lib/Headers/CMakeLists.txt | 1 +
clang/lib/Headers/EmissaryIds.h | 121 ++++
libc/docs/gpu/emissary.rst | 274 +++++++++
libc/docs/gpu/index.rst | 1 +
libc/shared/rpc_server.h | 2 +
libc/src/__support/RPC/CMakeLists.txt | 1 +
.../__support/RPC/emissary_device_utils.cpp | 115 ++++
libc/src/__support/RPC/emissary_rpc_server.h | 515 ++++++++++++++++
offload/plugins-nextgen/common/src/RPC.cpp | 3 +
14 files changed, 1991 insertions(+)
create mode 100644 78
create mode 100644 clang/lib/CodeGen/CGEmitEmissaryExec.cpp
create mode 100644 clang/lib/Headers/EmissaryIds.h
create mode 100644 libc/docs/gpu/emissary.rst
create mode 100644 libc/src/__support/RPC/emissary_device_utils.cpp
create mode 100644 libc/src/__support/RPC/emissary_rpc_server.h
diff --git a/78 b/78
new file mode 100644
index 0000000000000..203e1397f84f6
--- /dev/null
+++ b/78
@@ -0,0 +1,554 @@
+//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is intended to be used externally as part of the `shared/`
+// interface. Consider this an extenion of rpc_server.h to support emissary
+// APIs. rpc_server.h must be included first.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+#define LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+
+#include "../clang/lib/Headers/EmissaryIds.h"
+#include "rpc_server.h"
+#include <string.h>
+#include <unordered_map>
+
+namespace EmissaryExternal {
+extern "C" {
+/// Called by EmissaryTop for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+ emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support Fortran IO runtime
+__attribute((weak)) EmissaryReturn_t EmissaryFortrt(char *data,
+ emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support printf/fprintf/asan report externally
+__attribute((weak)) EmissaryReturn_t EmissaryPrint(char *data,
+ emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+} // end extern "C"
+} // namespace EmissaryExternal
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+#if 1
+enum TypeID {
+ // PrimitiveTypes
+ HalfTyID = 0, ///< 16-bit floating point type
+ BFloatTyID, ///< 16-bit floating point type (7-bit significand)
+ FloatTyID, ///< 32-bit floating point type
+ DoubleTyID, ///< 64-bit floating point type
+ X86_FP80TyID, ///< 80-bit floating point type (X87)
+ FP128TyID, ///< 128-bit floating point type (112-bit significand)
+ PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+ VoidTyID, ///< type with no size
+ LabelTyID, ///< Labels
+ MetadataTyID, ///< Metadata
+ X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
+ TokenTyID, ///< Tokens
+
+ // Derived types... see DerivedTypes.h file.
+ IntegerTyID, ///< Arbitrary bit width integers
+ ByteTyID, ///< Arbitrary bit width bytes
+ FunctionTyID, ///< Functions
+ PointerTyID, ///< Pointers
+ StructTyID, ///< Structures
+ ArrayTyID, ///< Arrays
+ FixedVectorTyID, ///< Fixed width SIMD vector type
+ ScalableVectorTyID, ///< Scalable SIMD vector type
+ TypedPointerTyID, ///< Typed pointer used by some GPU targets
+ TargetExtTyID, ///< Target extension type
+};
+#endif
+
+// emisExtractArgBuf extract ArgBuf using protocol EmitEmissaryExec makes.
+static void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+ uint32_t *int32_data = (uint32_t *)data;
+ ab->DataLen = int32_data[0];
+ ab->NumArgs = int32_data[1];
+
+ // Note: while the data buffer contains all args including strings,
+ // ab->DataLen does not include strings. It only counts header, keys,
+ // and aligned numerics.
+
+ ab->keyptr = data + (2 * sizeof(int));
+ ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+ ab->strptr = data + (size_t)ab->DataLen;
+ int alignfill = 0;
+ if (((size_t)ab->argptr) % (size_t)8) {
+ ab->argptr += 4;
+ alignfill = 4;
+ }
+
+ // Extract the two emissary identifiers and number of send
+ // and recv device data transfers. These are 4 16 bit values
+ // packed into a single 64-bit field.
+ uint64_t arg1 = *(uint64_t *)ab->argptr;
+ ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+ ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+ ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+ ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+ // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+ ab->keyptr += sizeof(int);
+ ab->argptr += sizeof(uint64_t);
+ ab->NumArgs -= 1;
+
+ // data_not_used used for testing consistency.
+ ab->data_not_used =
+ (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+ sizeof(uint64_t) + alignfill);
+
+ // Ensure first arg after emissary id arg is aligned.
+ if (((size_t)ab->argptr) % (size_t)8) {
+ ab->argptr += 4;
+ ab->data_not_used -= 4;
+ }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+static uint64_t getuint32(char *val) {
+ uint32_t i32 = *(uint32_t *)val;
+ return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+static uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+// build argument array to create call to variadic wrappers
+static uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+ unsigned long long *data_not_used, emis_argptr_t *a[],
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ size_t num_bytes;
+ size_t bytes_consumed;
+ size_t strsz;
+ size_t fillerNeeded;
+
+ uint argcount = 0;
+
+ for (int argnum = 0; argnum < NumArgs; argnum++) {
+ num_bytes = 0;
+ strsz = 0;
+ unsigned int key = *(unsigned int *)keyptr;
+ unsigned int llvmID = key >> 16;
+ unsigned int numbits = (key << 16) >> 16;
+
+ switch (llvmID) {
+ case FloatTyID: ///< 2: 32-bit floating point type
+ case DoubleTyID: ///< 3: 64-bit floating point type
+ case FP128TyID: ///< 5: 128-bit floating point type (112-bit mantissa)
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+
+ if (num_bytes == 4)
+ a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+ else
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ break;
+
+ case IntegerTyID: ///< 11: Arbitrary bit width integers
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+
+ if (num_bytes == 4)
+ a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+ else
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ break;
+
+ case PointerTyID: { ///< 15: Pointers
+ if (numbits == 1) { // This is a pointer to string
+ num_bytes = 4;
+ bytes_consumed = num_bytes;
+ strsz = (size_t)*(unsigned int *)dataptr;
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ a[argcount] = (emis_argptr_t *)((char *)strptr);
+ } else {
+ num_bytes = 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded; // dataptr is now aligned
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ }
+ if (D2HAddrList) {
+ auto found = D2HAddrList->find((void *)a[argcount]);
+ if (found != D2HAddrList->end())
+ a[argcount] = (emis_argptr_t *)found->second;
+ }
+ } break;
+
+ case HalfTyID: ///< 1: 16-bit floating point type
+ case ArrayTyID: ///< 14: Arrays
+ case StructTyID: ///< 13: Structures
+ case FunctionTyID: ///< 12: Functions
+ case TokenTyID: ///< 10: Tokens
+ case MetadataTyID: ///< 8: Metadata
+ case LabelTyID: ///< 7: Labels
+ case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
+ ///< PowerPC)
+ case X86_FP80TyID: ///< 4: 80-bit floating point type (X87)
+ case FixedVectorTyID: ///< 16: Fixed width SIMD vector type
+ case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+ case TypedPointerTyID: ///< Typed pointer used by some GPU targets
+ case TargetExtTyID: ///< Target extension type
+ case VoidTyID:
+ return _ERC_UNSUPPORTED_ID_ERROR;
+ break;
+ default:
+ return _ERC_INVALID_ID_ERROR;
+ }
+ // Move to next argument
+ dataptr += num_bytes;
+ strptr += strsz;
+ *data_not_used -= bytes_consumed;
+ keyptr += 4;
+ argcount++;
+ }
+ return _ERC_SUCCESS;
+}
+
+// Utility to skip two args in the ArgBuf
+static void emisSkipXferArgSet(emisArgBuf_t *ab) {
+ // Skip the ptr and size of the Xfer
+ ab->NumArgs -= 2;
+ ab->keyptr += 2 * sizeof(uint32_t);
+ ab->argptr += 2 * sizeof(void *);
+ ab->data_not_used -= 2 * sizeof(void *);
+}
+
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
+
+ if (ab->DataLen == 0)
+ return _ERC_SUCCESS;
+ char *fmtstr = ab->strptr;
+ FILE *fileptr = (FILE *)*((size_t *)ab->argptr);
+
+ // Skip past the file pointer
+ ab->NumArgs--;
+ ab->keyptr += 4;
+ ab->argptr += sizeof(FILE *);
+ ab->data_not_used -= sizeof(FILE *);
+
+ // Skip past the format string
+ ab->NumArgs--;
+ ab->keyptr += 4;
+ size_t abstrsz = (size_t)*(unsigned int *)ab->argptr;
+ ab->strptr += abstrsz;
+ ab->argptr += 4;
+ ab->data_not_used -= 4;
+
+ emissary_ValistExt_t valist; // FIXME: we may want to align this declare
+ va_list *real_va_list;
+ real_va_list = (va_list *)&valist;
+
+ if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+ ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+ return _ERC_ERROR_INVALID_REQUEST;
+
+ // Roll back offsets and save stack pointer
+ valist.gp_offset = 0;
+ valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+ void *save_stack = valist.overflow_arg_area;
+ *rc = vfprintf(fileptr, fmtstr, *real_va_list);
+ if (valist.reg_save_area)
+ free(valist.reg_save_area);
+ if (save_stack)
+ free(save_stack);
+ return _ERC_SUCCESS;
+}
+
+static EmissaryReturn_t
+EmissaryTop(char *data, emisArgBuf_t *ab,
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ EmissaryReturn_t result = 0;
+ emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+ sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+ switch (ab->emisid) {
+ case EMIS_ID_INVALID: {
+ fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+ result = 0;
+ break;
+ }
+ case EMIS_ID_PRINT: {
+ result = EmissaryExternal::EmissaryPrint(data, ab, args);
+ break;
+ }
+ case EMIS_ID_MPI: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS) {
+ return (EmissaryReturn_t)0;
+ }
+ result = EmissaryExternal::EmissaryMPI(data, ab, args);
+ break;
+ }
+ case EMIS_ID_HDF5: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryExternal::EmissaryHDF5(data, ab, args);
+ break;
+ }
+ case EMIS_ID_FORTRT: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryExternal::EmissaryFortrt(data, ab, args);
+ break;
+ break;
+ }
+
+ case EMIS_ID_RESERVE: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryExternal::EmissaryReserve(data, ab, args);
+ break;
+ }
+ default:
+ fprintf(stderr,
+ "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+ ab->emisid, ab->emisfnid);
+ }
+ free(args);
+ return result;
+}
+
+// -----------------------------------------------------------------
+// -- Handle OFFLOAD_EMISSARY and OFFLOAD_EMISSARY_DM opcodes --
+// -- handle_emissary_impl calls EmissaryTop for each active lane --
+// -----------------------------------------------------------------
+template <uint32_t NumLanes>
+LIBC_INLINE static ::rpc::Status
+handle_emissary_impl(::rpc::Server::Port &port) {
+
+ switch (port.get_opcode()) {
+
+ // This case handles the device function __llvm_emissary_rpc for emissary
+ // APIs that require no d2h or h2d memory transfer.
+ case OFFLOAD_EMISSARY: {
+ uint64_t Sizes[NumLanes] = {0};
+ unsigned long long Results[NumLanes] = {0};
+ void *buf_ptrs[NumLanes] = {nullptr};
+ port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+ uint32_t id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t ab;
+ emisExtractArgBuf((char *)buffer_ptr, &ab);
+ Results[id++] = EmissaryTop((char *)buffer_ptr, &ab, nullptr);
+ }
+ }
+ port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+ Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+ delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+ });
+ break;
+ }
+
+ // This case handles the device function __llvm_emissary_rpc_dm for emissary
+ // APIs require D2H or H2D transfer vectors to be processed through the port.
+ // FIXME: test with multiple transfer vectors of the same type.
+ case OFFLOAD_EMISSARY_DM: {
+ uint64_t Sizes[NumLanes] = {0};
+ unsigned long long Results[NumLanes] = {0};
+ void *buf_ptrs[NumLanes] = {nullptr};
+ port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+ uint32_t id = 0;
+ emisArgBuf_t AB[NumLanes];
+ std::unordered_map<void *, void *> D2HAddrList;
+ void *Xfers[NumLanes] = {nullptr};
+ void *devXfers[NumLanes] = {nullptr};
+ uint64_t XferSzs[NumLanes] = {0};
+ uint32_t numSendXfers = 0;
+ id = 0;
+
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+
+ emisArgBuf_t *ab = &AB[id];
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+ numSendXfers++;
+ devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+ XferSzs[id] = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
+ emisSkipXferArgSet(ab);
+ }
+ // Allocate the host space for the receive Xfers
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ size_t devSz = (((size_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+ 0x00000000FFFFFFFF);
+ void *hostAddr = new char[devSz];
+ D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+
+ // recv_n for device send_n into new host-allocated Xfers
+ if (numSendXfers)
+ port.recv_n(Xfers, XferSzs,
+ [&](uint64_t Size) { return new char[Size]; });
+
+ // Xfers now contains just allocated host addrs for sends and
+ // devXfers contains corresponding devAddr for those sends
+ // Build map to pass to Emissary
+ id = 0;
+ for (void *Xfer : Xfers) {
+ if (Xfer) {
+ D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+ id++;
+ }
+ }
+
+ // Call EmissaryTop for each active lane
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+ emisSkipXferArgSet(ab);
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+ emisSkipXferArgSet(ab);
+ Results[id] = EmissaryTop((char *)buffer_ptr, ab, &D2HAddrList);
+ id++;
+ }
+ }
+
+ // Process send_n for the H2D Xfers.
+ void *recvXfers[NumLanes] = {nullptr};
+ uint64_t recvXferSzs[NumLanes] = {0};
+ id = 0;
+ uint32_t numRecvXfers = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ // Reset ArgBuf tracker
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+ emisSkipXferArgSet(ab);
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ numRecvXfers++;
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ recvXfers[id] = D2HAddrList[devAddr];
+ recvXferSzs[id] =
+ (((uint64_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+ 0x00000000FFFFFFFF);
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+ if (numRecvXfers)
+ port.send_n(recvXfers, recvXferSzs);
+
+ // Cleanup all host allocated transfer buffers
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ // Reset the ArgBuf tracker ab
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ // Cleanup host allocated send Xfers
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ void *hostAddr = D2HAddrList[devAddr];
+ delete[] reinterpret_cast<char *>(hostAddr);
+ emisSkipXferArgSet(ab);
+ }
+ // Cleanup host allocated bufs
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ void *hostAddr = D2HAddrList[devAddr];
+ delete[] reinterpret_cast<char *>(hostAddr);
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+
+ port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+ Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+ delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+ });
+
+ break;
+ } // END CASE OFFLOAD_EMISSARY_DM
+
+ default: {
+ return ::rpc::RPC_UNHANDLED_OPCODE;
+ break;
+ }
+ }
+ return ::rpc::RPC_SUCCESS;
+} // end handle_emissary_impl
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+namespace LIBC_NAMESPACE_DECL {
+namespace rpc {
+LIBC_INLINE ::rpc::Status handleEmissaryOpcodes(::rpc::Server::Port &port,
+ uint32_t NumLanes) {
+ if (NumLanes == 1)
+ return internal::handle_emissary_impl<1>(port);
+ else if (NumLanes == 32)
+ return internal::handle_emissary_impl<32>(port);
+ else if (NumLanes == 64)
+ return internal::handle_emissary_impl<64>(port);
+ else
+ return ::rpc::RPC_ERROR;
+}
+
+} // namespace rpc
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
new file mode 100644
index 0000000000000..d9511244a0b8c
--- /dev/null
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -0,0 +1,392 @@
+//===------- CGEmitEmissaryExec.cpp - Codegen for _emissary_exec --==------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmitEmissaryExec:
+//
+// When a device call to the varadic function _emissary_exec is encountered
+// (in CGExpr.cpp) EmitEmissaryExec does these steps:
+//
+// 1. If string lens are runtime dependent, Emit code to determine runtime len.
+// 2. Emits call to allocate memory __llvm_emissary_premalloc,
+// 3. Emit stores of each arg into arg buffer,
+// 4. Emits call to function __llvm_emissary_rpc or __llvm_emissary_rpc_dm
+//
+// The arg buffer is a struct that contains the length, number of args, an
+// array of 4-byte keys that represent the type of of each arg, an array of
+// aligned "data" values for each arg, and finally the runtime string values.
+// If an arg is a string the data value is the runtime length of the string.
+// Each 4-byte key contains the llvm type ID and the number of bits for the
+// type. encoded by the macro _PACK_TY_BITLEN(x,y) ((uint32_t)x << 16) |
+// ((uint32_t)y)
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../clang/lib/Headers/EmissaryIds.h"
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+// These static helper functions support EmitEmissaryExec.
+static llvm::Function *GetOmpStrlenDeclaration(CodeGenModule &CGM) {
+ auto &M = CGM.getModule();
+ // Args are pointer to char and maxstringlen
+ llvm::Type *ArgTypes[] = {CGM.Int8PtrTy, CGM.Int32Ty};
+ llvm::FunctionType *OmpStrlenFTy =
+ llvm::FunctionType::get(CGM.Int32Ty, ArgTypes, false);
+ if (auto *F = M.getFunction("__strlen_max")) {
+ assert(F->getFunctionType() == OmpStrlenFTy);
+ return F;
+ }
+ llvm::Function *FN = llvm::Function::Create(
+ OmpStrlenFTy, llvm::GlobalVariable::ExternalLinkage, "__strlen_max", &M);
+ return FN;
+}
+
+// Deterimines if an expression is a string with variable lenth
+static bool isVarString(const clang::Expr *argX, const clang::Type *argXTy,
+ const llvm::Value *Arg) {
+ if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+ argXTy->getPointeeOrArrayElementType()->isCharType() && !argX->isLValue())
+ return true;
+ // Ensure the VarDecl has an inititalizer
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(argX))
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+ if (!VD->getInit() ||
+ !llvm::isa<StringLiteral>(VD->getInit()->IgnoreImplicit()))
+ return true;
+ return false;
+}
+
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+ if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+ argXTy->getPointeeOrArrayElementType()->isCharType())
+ return true;
+ else
+ return false;
+}
+
+// Gets a string literal to write into the transfer buffer
+static const StringLiteral *getSL(const clang::Expr *argX,
+ const clang::Type *argXTy) {
+ // String in argX has known constant length
+ if (!argXTy->isConstantArrayType()) {
+ // Allow constant string to be a declared variable,
+ // But it must be constant and initialized.
+ const DeclRefExpr *DRE = cast<DeclRefExpr>(argX);
+ const VarDecl *VarD = cast<VarDecl>(DRE->getDecl());
+ argX = VarD->getInit()->IgnoreImplicit();
+ }
+ const StringLiteral *SL = cast<StringLiteral>(argX);
+ return SL;
+}
+
+// Returns a function pointer to __llvm_emissary_premalloc
+static llvm::Function *GetEmissaryAllocDeclaration(CodeGenModule &CGM) {
+ auto &M = CGM.getModule();
+ // clang::CodeGen::CodeGenTypes &CGT = CGM.getTypes();
+ const char *_executeName = "__llvm_emissary_premalloc";
+ llvm::Type *ArgTypes[] = {CGM.Int32Ty};
+ llvm::Function *FN;
+ // Maybe this should be pointer to char instead of pointer to void
+ llvm::FunctionType *VargsFnAllocFuncType = llvm::FunctionType::get(
+ CGM.getTypes().ConvertType(
+ CGM.getContext().getPointerType(CGM.getContext().VoidTy)),
+ ArgTypes, false);
+ if (!(FN = M.getFunction(_executeName)))
+ FN = llvm::Function::Create(VargsFnAllocFuncType,
+ llvm::GlobalVariable::ExternalLinkage,
+ _executeName, &M);
+ assert(FN->getFunctionType() == VargsFnAllocFuncType);
+ return FN;
+}
+
+// Returns a function pointer to __llvm_emissary_rpc
+static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
+ bool hasXfers) {
+ const char *_executeName =
+ hasXfers ? "__llvm_emissary_rpc_dm" : "__llvm_emissary_rpc";
+ auto &M = CGM.getModule();
+ llvm::Type *ArgTypes[] = {
+ CGM.Int32Ty, CGM.getTypes().ConvertType(CGM.getContext().getPointerType(
+ CGM.getContext().VoidTy))};
+ llvm::Function *FN;
+ llvm::FunctionType *VarfnFuncType =
+ llvm::FunctionType::get(CGM.Int64Ty, ArgTypes, false);
+ if (!(FN = M.getFunction(_executeName)))
+ FN = llvm::Function::Create(
+ VarfnFuncType, llvm::GlobalVariable::ExternalLinkage, _executeName, &M);
+ assert(FN->getFunctionType() == VarfnFuncType);
+ return FN;
+}
+
+// A macro to pack the llvm type ID and numbits into 4-byte key
+#define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
+
+// ----- External function EmitEmissaryExec called from CGExpr.cpp -----
+RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
+ assert(getTarget().getTriple().isAMDGCN() ||
+ getTarget().getTriple().isNVPTX());
+ assert(E->getNumArgs() >= 1); // _emissary_exec always has at least one arg.
+ const llvm::DataLayout &DL = CGM.getDataLayout();
+ CallArgList Args;
+ // --- Insert 1st emisid arg if emiting fprintf or printf.
+ unsigned int AOE = 0;
+ if (E->getDirectCallee()->getNameAsString() == "fprintf") {
+ constexpr unsigned long long emisid =
+ ((unsigned long long)EMIS_ID_PRINT << 48) |
+ ((unsigned long long)_fprintf_idx << 32);
+ Args.add(
+ RValue::get(llvm::ConstantInt::get(Int64Ty, emisid)),
+ getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/false));
+ AOE = 1; // Arg# offset to E->arguments to use with E->getArg(I-AOE)
+ }
+ if (E->getDirectCallee()->getNameAsString() == "printf") {
+ constexpr unsigned long long emisid =
+ ((unsigned long long)EMIS_ID_PRINT << 48) |
+ ((unsigned long long)_printf_idx << 32);
+ Args.add(
+ RValue::get(llvm::ConstantInt::get(Int64Ty, emisid)),
+ getContext().getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/false));
+ AOE = 1; // Arg# offset to E->arguments to use with E->getArg(I-AOE)
+ }
+
+ EmitCallArgs(Args,
+ E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+ E->arguments(), E->getDirectCallee(),
+ /* ParamsToSkip = */ 0);
+
+ // We don't know how to emit non-scalar varargs.
+ if (std::any_of(Args.begin() + 1, Args.end(), [&](const CallArg &A) {
+ return !A.getRValue(*this).isScalar();
+ })) {
+ CGM.ErrorUnsupported(E, "non-scalar arg in GPU vargs function");
+ return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+ }
+ // NumArgs always includes emisid, but E->getNumArgs() could be 1 less if
+ // inserted it above.
+ unsigned NumArgs = (unsigned)Args.size();
+ llvm::SmallVector<llvm::Type *, 32> ArgTypes;
+ llvm::SmallVector<llvm::Value *, 32> VarStrLengths;
+ llvm::Value *TotalVarStrsLength = llvm::ConstantInt::get(Int32Ty, 0);
+ bool hasVarStrings = false;
+ ArgTypes.push_back(Int32Ty); // 1st field in struct is total DataLen
+ ArgTypes.push_back(Int32Ty); // 2nd field in struct will be num args
+ // An array of 4-byte keys that describe the arg type
+ for (unsigned I = 0; I < NumArgs; ++I)
+ ArgTypes.push_back(Int32Ty);
+
+ // Track the size of the numeric data length and string length
+ unsigned DataLen_CT =
+ (unsigned)(DL.getTypeAllocSize(Int32Ty)) * (NumArgs + 2);
+ unsigned AllStringsLen_CT = 0;
+
+ // --- 1st Pass over Args to create ArgTypes and count size ---
+ size_t structOffset = 4 * (NumArgs + 2);
+ for (unsigned I = 0; I < NumArgs; I++) {
+ llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
+ llvm::Type *ArgType = Arg->getType();
+ // Skip string processing on arg0 which may not be in E->getArg(0)
+ if (I != 0) {
+ const Expr *argX = E->getArg(I - AOE)->IgnoreParenCasts();
+ auto *argXTy = argX->getType().getTypePtr();
+ if (isString(argXTy)) {
+ if (isVarString(argX, argXTy, Arg)) {
+ hasVarStrings = true;
+ if (auto *PtrTy = dyn_cast<llvm::PointerType>(ArgType))
+ if (PtrTy->getPointerAddressSpace()) {
+ Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
+ ArgType = Arg->getType();
+ }
+ llvm::Value *VarStrLen =
+ Builder.CreateCall(GetOmpStrlenDeclaration(CGM),
+ {Arg, llvm::ConstantInt::get(Int32Ty, 1024)});
+ VarStrLengths.push_back(VarStrLen);
+ TotalVarStrsLength = Builder.CreateAdd(TotalVarStrsLength, VarStrLen,
+ "sum_of_var_strings_length");
+ ArgType = Int32Ty;
+ } else {
+ const StringLiteral *SL = getSL(argX, argXTy);
+ StringRef ArgString = SL->getString();
+ AllStringsLen_CT += ((int)ArgString.size() + 1);
+ // change ArgType from char ptr to int to contain string length
+ ArgType = Int32Ty;
+ }
+ } // end of processing string argument
+ } // End of skip 1st arg
+ // if ArgTypeSize is >4 bytes we need to insert dummy align
+ // values in the struct so all stores can be aligned .
+ // These dummy fields must be inserted before the arg.
+ //
+ // In the pass below where the stores are generated careful
+ // tracking of the index into the struct is necessary.
+ size_t needsPadding = (structOffset % (size_t)DL.getTypeAllocSize(ArgType));
+ if (needsPadding) {
+ DataLen_CT += (unsigned)needsPadding;
+ structOffset += needsPadding;
+ ArgTypes.push_back(Int32Ty); // could assert that needsPadding == 4 here
+ }
+
+ ArgTypes.push_back(ArgType);
+ DataLen_CT += ((int)DL.getTypeAllocSize(ArgType));
+ structOffset += (size_t)DL.getTypeAllocSize(ArgType);
+ }
+
+ // --- Generate call to __llvm_emissary_premalloc to get data pointer
+ if (hasVarStrings)
+ TotalVarStrsLength = Builder.CreateAdd(
+ TotalVarStrsLength,
+ llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT),
+ "total_buffer_size");
+ llvm::Value *BufferLen =
+ hasVarStrings
+ ? TotalVarStrsLength
+ : llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT);
+ llvm::Value *DataStructPtr =
+ Builder.CreateCall(GetEmissaryAllocDeclaration(CGM), {BufferLen});
+
+ // --- Cast the generic return pointer to be a struct in device global memory
+ llvm::StructType *DataStructTy =
+ llvm::StructType::create(ArgTypes, "varfn_args_store");
+ unsigned AS = getContext().getTargetAddressSpace(LangAS::cuda_device);
+ llvm::Value *BufferPtr = Builder.CreatePointerCast(
+ DataStructPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+ "varfn_args_store_casted");
+ // --- Header of struct contains length and NumArgs ---
+ llvm::Value *DataLenField = llvm::ConstantInt::get(Int32Ty, DataLen_CT);
+ llvm::Value *P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 0);
+ Builder.CreateAlignedStore(DataLenField, P,
+ DL.getPrefTypeAlign(DataLenField->getType()));
+ llvm::Value *NumArgsField = llvm::ConstantInt::get(Int32Ty, NumArgs);
+ P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 1);
+ Builder.CreateAlignedStore(NumArgsField, P,
+ DL.getPrefTypeAlign(NumArgsField->getType()));
+
+ // --- 2nd Pass: create array of 4-byte keys to describe each arg
+ for (unsigned I = 0; I < NumArgs; I++) {
+ llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
+ llvm::Type::TypeID argtypeid =
+ Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+
+ // Get type size in bits. Usually 64 or 32.
+ uint32_t numbits = 0;
+ if (I > 0 &&
+ isString(
+ E->getArg(I - AOE)->IgnoreParenCasts()->getType().getTypePtr()))
+ // The llvm typeID for string is pointer. Since pointer numbits is 0,
+ // we set numbits to 1 to distinguish pointer type ID as string pointer.
+ numbits = 1;
+ else
+ numbits = ty->getScalarSizeInBits();
+ // Create a key that combines llvm typeID and size
+ llvm::Value *Key =
+ llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+ P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
+ Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
+ }
+
+ // --- 3rd Pass: Store data values for each arg ---
+ unsigned varstring_index = 0;
+ unsigned structIndex = 2 + NumArgs;
+ structOffset = 4 * structIndex;
+ bool hasXfers;
+ for (unsigned I = 0; I < NumArgs; I++) {
+ llvm::Value *Arg = nullptr;
+ if (I == 0) {
+ Arg = Args[I].getKnownRValue().getScalarVal();
+ llvm::ConstantInt *CI = llvm::dyn_cast<llvm::ConstantInt>(Arg);
+ uint64_t uint64value = CI->getZExtValue();
+ uint32_t lower_32 = (uint32_t)(uint64value & 0xFFFFFFFF);
+ hasXfers = lower_32 ? true : false;
+ } else {
+ const Expr *argX = E->getArg(I - AOE)->IgnoreParenCasts();
+ auto *argXTy = argX->getType().getTypePtr();
+ if (isString(argXTy)) {
+ if (isVarString(argX, argXTy, Arg)) {
+ Arg = VarStrLengths[varstring_index];
+ varstring_index++;
+ } else {
+ const StringLiteral *SL = getSL(argX, argXTy);
+ StringRef ArgString = SL->getString();
+ int ArgStrLen = (int)ArgString.size() + 1;
+ // Change Arg from a char pointer to the integer string length
+ Arg = llvm::ConstantInt::get(Int32Ty, ArgStrLen);
+ }
+ } else {
+ Arg = Args[I].getKnownRValue().getScalarVal();
+ }
+ }
+ size_t structElementSize = (size_t)DL.getTypeAllocSize(Arg->getType());
+ size_t needsPadding = (structOffset % structElementSize);
+ if (needsPadding) {
+ // Skip over dummy fields in struct to align
+ structOffset += needsPadding; // should assert needsPadding == 4
+ structIndex++;
+ }
+ P = Builder.CreateStructGEP(DataStructTy, BufferPtr, structIndex);
+ Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
+ structOffset += structElementSize;
+ structIndex++;
+ }
+
+ // --- 4th Pass: memcpy all strings after the data values ---
+ // bitcast the struct in device global memory as a char buffer
+ Address BufferPtrByteAddr =
+ Address(Builder.CreatePointerCast(
+ BufferPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+ "_casted"),
+ Int8Ty, CharUnits::fromQuantity(1));
+
+ // BufferPtrByteAddr is a pointer to where we want to write the next string
+ BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+ BufferPtrByteAddr, CharUnits::fromQuantity(DataLen_CT));
+ varstring_index = 0;
+ // Skip string processing on arg0 which may not be in E->getArg(0)
+ for (unsigned I = 1; I < NumArgs; ++I) {
+ llvm::Value *Arg = Args[I].getKnownRValue().getScalarVal();
+ const Expr *argX = E->getArg(I - AOE)->IgnoreParenCasts();
+ auto *argXTy = argX->getType().getTypePtr();
+ if (isString(argXTy)) {
+ if (isVarString(argX, argXTy, Arg)) {
+ llvm::Value *varStrLength = VarStrLengths[varstring_index];
+ varstring_index++;
+ Address SrcAddr = Address(Arg, Int8Ty, CharUnits::fromQuantity(1));
+ Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, varStrLength);
+ // update BufferPtrByteAddr for next string memcpy
+ llvm::Value *PtrAsInt = BufferPtrByteAddr.emitRawPointer(*this);
+ BufferPtrByteAddr =
+ Address(Builder.CreateGEP(Int8Ty, PtrAsInt,
+ ArrayRef<llvm::Value *>(varStrLength)),
+ Int8Ty, CharUnits::fromQuantity(1));
+ } else {
+ const StringLiteral *SL = getSL(argX, argXTy);
+ StringRef ArgString = SL->getString();
+ int ArgStrLen = (int)ArgString.size() + 1;
+ Address SrcAddr = CGM.GetAddrOfConstantStringFromLiteral(SL);
+ Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, ArgStrLen);
+ // update BufferPtrByteAddr for next memcpy
+ BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+ BufferPtrByteAddr, CharUnits::fromQuantity(ArgStrLen));
+ }
+ }
+ }
+ // --- Generate call to __llvm_emissary_rpc and return RValue
+ llvm::Value *emis_rc = Builder.CreateCall(
+ GetEmissaryExecDeclaration(CGM, hasXfers), {BufferLen, DataStructPtr});
+ // truncate long long int to int for printf return value.
+ if ((E->getDirectCallee()->getNameAsString() == "fprintf") ||
+ (E->getDirectCallee()->getNameAsString() == "printf"))
+ emis_rc = Builder.CreateTrunc(emis_rc, CGM.Int32Ty, "emis_rc");
+ return RValue::get(emis_rc);
+}
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 23802cdeb4811..4bfc5f4b15ee9 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -7033,6 +7033,16 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
StaticOperator = true;
}
+ // EmitEmissaryExec generates code to allocate an arg buffer, fill buffer
+ // with _emissary_exec args, then generate a call to either
+ // __llvm_emissary_rpc or __llvm_emissary_rpc_dm which are rpc utilities.
+ if ((CGM.getTriple().isAMDGCN() || CGM.getTriple().isNVPTX()) && FnType &&
+ dyn_cast<FunctionProtoType>(FnType) &&
+ dyn_cast<FunctionProtoType>(FnType)->isVariadic() &&
+ E->getDirectCallee() &&
+ (E->getDirectCallee()->getNameAsString() == "_emissary_exec"))
+ return EmitEmissaryExec(E);
+
auto Arguments = E->arguments();
if (StaticOperator) {
// If we're calling a static operator, we need to emit the object argument
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index 434781b3c4f02..0481678be5c35 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangCodeGen
CGAtomic.cpp
CGBlocks.cpp
CGBuiltin.cpp
+ CGEmitEmissaryExec.cpp
CGCUDANV.cpp
CGCUDARuntime.cpp
CGCXX.cpp
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 85c058ba237ee..1f061dc63c5db 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4751,6 +4751,7 @@ class CodeGenFunction : public CodeGenTypeCache {
RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
+ RValue EmitEmissaryExec(const CallExpr *E);
RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
const CallExpr *E, ReturnValueSlot ReturnValue);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c6c299bb61af3..1cc5302e27cba 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -42,6 +42,7 @@ set(core_files
tgmath.h
unwind.h
varargs.h
+ EmissaryIds.h
)
set(arm_common_files
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
new file mode 100644
index 0000000000000..921bc5453b848
--- /dev/null
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -0,0 +1,121 @@
+//===- openmp/device/include/EmissaryIds.h enum & headers ----- C++ -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines Emissary API identifiers. This header is used by both host
+// and device compilations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_IDS_H
+#define OFFLOAD_EMISSARY_IDS_H
+
+#define __DEVATTR__
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+#if defined(__HIP__) || defined(__CUDA__)
+#if defined(__DEVATTR__)
+#undef __DEVATTR__
+#endif
+#define __DEVATTR__ __device__
+#endif
+#endif
+
+extern "C" __DEVATTR__ unsigned long long int
+_emissary_exec(const unsigned long long int, ...);
+
+#define _PACK_EMIS_IDS(a, b, c, d) \
+ ((unsigned long long)a << 48) | ((unsigned long long)b << 32) | \
+ ((unsigned long long)c << 16) | ((unsigned long long)d)
+
+/// These are the various Emissary APIs currently defined.
+/// MPI, HDF5, and, RESERVE are "external" Emissary APIs whose device stubs and
+/// host runtime support are provided by library maintainers typically in the
+/// form of a header such as "EmissaryMPI.h". The stubs call _emissary_exec.
+/// The host runtime support will call functions from the actual host library
+/// which are often platform specific and thus only linkable by an application.
+/// A small demo of an external Emissary API (EmissaryMPI.h) is found in docs.
+
+typedef enum {
+ EMIS_ID_INVALID,
+ EMIS_ID_FORTRT,
+ EMIS_ID_PRINT,
+ EMIS_ID_MPI,
+ EMIS_ID_HDF5,
+ EMIS_ID_RESERVE,
+} offload_emis_id_t;
+
+typedef enum {
+ _print_INVALID,
+ _printf_idx,
+ _fprintf_idx,
+ _ockl_asan_report_idx,
+} offload_emis_print_t;
+
+/// The future EMIS_ID_FORTRT will provide these device functions
+typedef enum {
+ _FortranAio_INVALID,
+ _FortranAioBeginExternalListOutput_idx,
+ _FortranAioOutputAscii_idx,
+ _FortranAioOutputInteger32_idx,
+ _FortranAioEndIoStatement_idx,
+ _FortranAioOutputInteger8_idx,
+ _FortranAioOutputInteger16_idx,
+ _FortranAioOutputInteger64_idx,
+ _FortranAioOutputReal32_idx,
+ _FortranAioOutputReal64_idx,
+ _FortranAioOutputComplex32_idx,
+ _FortranAioOutputComplex64_idx,
+ _FortranAioOutputLogical_idx,
+ _FortranAAbort_idx,
+ _FortranAStopStatementText_idx,
+ _FortranAioBeginExternalFormattedOutput_idx,
+ _FortranAStopStatement_idx,
+} offload_emis_fortrt_idx;
+
+/// This structure is created by emisExtractArgBuf to get information
+/// from the data buffer passed by rpc.
+typedef struct {
+ unsigned int DataLen;
+ unsigned int NumArgs;
+ unsigned int emisid;
+ unsigned int emisfnid;
+ unsigned int NumSendXfers;
+ unsigned int NumRecvXfers;
+ unsigned long long data_not_used;
+ char *keyptr;
+ char *argptr;
+ char *strptr;
+} emisArgBuf_t;
+
+typedef unsigned long long EmissaryReturn_t;
+typedef unsigned long long emis_argptr_t;
+typedef EmissaryReturn_t emisfn_t(void *, ...);
+
+#define MAXVARGS 32
+
+typedef enum service_rc {
+ _ERC_SUCCESS = 0,
+ _ERC_STATUS_ERROR = 1,
+ _ERC_DATA_USED_ERROR = 2,
+ _ERC_ADDINT_ERROR = 3,
+ _ERC_ADDFLOAT_ERROR = 4,
+ _ERC_ADDSTRING_ERROR = 5,
+ _ERC_UNSUPPORTED_ID_ERROR = 6,
+ _ERC_INVALID_ID_ERROR = 7,
+ _ERC_ERROR_INVALID_REQUEST = 8,
+ _ERC_EXCEED_MAXVARGS_ERROR = 9,
+} service_rc;
+
+#define LLVM_EMISSARY_BASE 'e'
+#define LLVM_EMISSARY_OPCODE(n) (LLVM_EMISSARY_BASE << 24 | n)
+
+typedef enum {
+ OFFLOAD_EMISSARY = LLVM_EMISSARY_OPCODE(1),
+ OFFLOAD_EMISSARY_DM = LLVM_EMISSARY_OPCODE(2),
+} offload_emissary_t;
+
+#endif // OFFLOAD_EMISSARY_IDS_H
diff --git a/libc/docs/gpu/emissary.rst b/libc/docs/gpu/emissary.rst
new file mode 100644
index 0000000000000..cdff3e59ba671
--- /dev/null
+++ b/libc/docs/gpu/emissary.rst
@@ -0,0 +1,274 @@
+.. _libc_gpu_emissary:
+
+=============
+Emissary APIs
+=============
+
+Emissary APIs
+-------------
+
+The libc GPU feature of LLVM offload provides the ability to execute
+libc functions on the host initiated from GPU code including OpenMP
+target regions, CUDA kernels, or HIP kernels.
+This libc-gpu capability uses the LLVM offload RPC mechanism.
+An extension to libc-gpu is the abiity to execute arbitrary functions
+initiated by GPU source code. This extension is called ``Emissary APIs``.
+
+Emissary APIs allow host API maintainers to easily maintain and distribute
+a platform-specific Emissary API directly callable from the GPU.
+The LLVM runtime does not link to the platform-specific host runtime until
+th application links to the host runtime. For standarized APIs, such as MPI,
+emissary APIs allows multiple platform-specific implementations of MPI.
+This results in increased application portability. But the primary
+benefit of Emissary APIs is the ability to execute host functions without
+terminating the GPU kernel or code.
+
+An Emissary API implementation consists of a header file and a simple c++
+source file. The same implementation can be used for OpenMP, HIP, or Cuda.
+The c++ source file can be compiled as part of the host API library.
+
+This architecture allows host API maintainers to maintain the Emissary API
+externally. That is, no ,change to the compiler or runtime is required.
+Maintainers can implement any subset of the host API in their emissary
+API implementation. Users attempting to use an unimplemented function
+from GPU code would get the same unresolved GPU reference they get
+without an Emissary API implementation.
+
+Because they execute the actual external host functions, the server
+implementation cannot be directly linked to the LLVM runtime. The LLVM
+runtime in the emissary support provides a weak external reference
+to a single master function for the Emissary API. The external API
+maintainer provides this master function consisting of a case clause and
+wrapper function for each implemented function. There are a number of ways
+the API maintainer can package and distribute emissary support for a
+platform-specific API. Compiling the master function into the host library
+and distributing a device header file is typical.
+
+This external implementation architecture provides the ability to have
+different platform-specific APIs for standard interface libraries
+such as ROCm MPI or CUDA MPI.
+
+In this documentation, we provide an MPI example with a few MPI
+functions as a demonstration.
+
+EmisssaryMPI Example
+--------------------
+
+External Emissary APIs require an external library such as OpenMPI.
+This example source shows the execution of MPI_Send and MPI_Recv
+from an OpenMP target region.
+
+
+.. code-block:: c++
+
+ //
+ // EmissaryMPI_example.cpp
+ //
+ #include <EmissaryMPI.h>
+ #include <mpi.h>
+ #include <omp.h>
+ #include <stdio.h>
+ #define VSIZE 5000
+ int main(int argc, char *argv[]) {
+ int numranks, rank;
+ MPI_Init(&argc, &argv);
+ MPI_Comm_size(MPI_COMM_WORLD, &numranks);
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm _mpi_comm = MPI_COMM_WORLD;
+ MPI_Datatype _mpi_int = MPI_INT;
+ int rc = 0;
+ int *send_recv_buffer = (int *)malloc(VSIZE * sizeof(int));
+ printf("R%d: Number of Ranks= %d ArraySize:%d\n", rank, numranks, VSIZE);
+ #pragma omp target teams distribute parallel for map(tofrom : rc) \
+ map(to : send_recv_buffer[0 : VSIZE])
+ for (int i = 0; i < VSIZE; i++) {
+ if (rank == 0) {
+ send_recv_buffer[i] = -i;
+ MPI_Send(&send_recv_buffer[i], 1, _mpi_int, 1, i, _mpi_comm);
+ } else {
+ MPI_Recv(&send_recv_buffer[i], 1, _mpi_int, 0, i, _mpi_comm,
+ MPI_STATUS_IGNORE);
+ if (send_recv_buffer[i] != -i)
+ rc = 1; // FLAG AS ERROR IF NOT EXPECTED.
+ }
+ }
+ MPI_Finalize();
+ printf("R%d: === rc === %d\n", rank, rc);
+ return rc;
+ }
+
+It is worth noting that without Emissary API for MPI, the device link
+step for the above code would fail with unresolved references to
+MPI_Send and MPI_Recv. The include of EmissaryMPI.h resolves this.
+
+.. code-block:: h
+
+ //===--------------------------------------------------------------------===//
+ //
+ // EmissaryMPI.h : Example device header for EmissaryMPI
+ //
+ //===--------------------------------------------------------------------===//
+ #ifndef OFFLOAD_EMISSARY_MPI_H
+ #define OFFLOAD_EMISSARY_MPI_H
+ #include "EmissaryIds.h"
+ #include <mpi.h>
+ #include <stdarg.h>
+ typedef enum {
+ _MPI_INVALID,
+ _MPI_Send_idx,
+ _MPI_Recv_idx,
+ } offload_emis_mpi_t;
+ #if defined(__NVPTX__) || defined(__AMDGCN__)
+ // EmissaryIds.h sets __DEVATTR__ to __device__ when compiling for either
+ // CUDA or HIP. That attribute is not used for OpenMP device compilation.
+ __DEVATTR__
+ extern "C" int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
+ int dest, int tag, MPI_Comm comm) {
+ return (int) _emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0),
+ buf, (int)count * 4, buf, count, datatype, dest, tag, comm);
+ }
+ __DEVATTR__
+ extern "C" int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
+ int tag, MPI_Comm comm, MPI_Status *st) {
+ return (int) _emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1),
+ buf, (int)count * 4, buf, count, datatype, source, tag, comm, st);
+ }
+ #endif
+ #endif // end #ifndef OFFLOAD_EMISSARY_MPI_H
+
+The above is a minimal Emissary API implementation to provide MPI_Send and MPI_Recv
+functionality for demonstration. The host service side to support this is the following:
+
+.. code-block:: cpp
+
+ //===--------------------------------------------------------------------===//
+ //
+ // EmissaryMPI.cpp : Defines the EmissaryMPI master function and variadic
+ // wrappers for each implemented function. In this example
+ // only MPI_Send and MPI_Recv are callable by GPU.
+ //
+ //===--------------------------------------------------------------------===//
+ #include "EmissaryMPI.h"
+ #include <EmissaryIds.h>
+ #include <mpi.h>
+ static int V_MPI_Send(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ int v1 = va_arg(args, int);
+ MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+ int v3 = va_arg(args, int);
+ int v4 = va_arg(args, int);
+ MPI_Comm v5 = va_arg(args, MPI_Comm);
+ va_end(args);
+ int rval = MPI_Send(v0, v1, v2, v3, v4, v5);
+ return rval;
+ }
+ static int V_MPI_Recv(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ int v1 = va_arg(args, int);
+ MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+ int v3 = va_arg(args, int);
+ int v4 = va_arg(args, int);
+ MPI_Comm v5 = va_arg(args, MPI_Comm);
+ MPI_Status *v6 = va_arg(args, MPI_Status *);
+ va_end(args);
+ int rval = MPI_Recv(v0, v1, v2, v3, v4, v5, v6);
+ return rval;
+ }
+ namespace EmissaryExternal {
+ // The EmissaryMPI master function selector
+ extern "C" EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *a[]) {
+ switch (ab->emisfnid) {
+ case _MPI_Send_idx: {
+ void *fnptr = (void *)V_MPI_Send;
+ int return_value_int =
+ V_MPI_Send(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+ return (EmissaryReturn_t)return_value_int;
+ }
+ case _MPI_Recv_idx: {
+ void *fnptr = (void *)V_MPI_Recv;
+ int return_value_int =
+ V_MPI_Recv(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+ return (EmissaryReturn_t)return_value_int;
+ }
+ return (EmissaryReturn_t)0;
+ }
+ }
+
+The above OpenMP user source code can be compiled and executed with
+the following shell script:
+
+.. code-block:: sh
+
+ #/bin/bash
+ #
+ # demo_mpi.sh
+ #
+ MPI=${MPI:-~/local/openmpi}
+ LLVM_INSTALL=${LLVM_INSTALL:-/work/grodgers/rocm/trunk}
+ OFFLOAD_ARCH=${OFFLOAD_ARCH:-gfx90a}
+ export PATH=$MPI/bin:$PATH
+ [ ! -d "$MPI" ] && echo "MPI:$MPI not found" && exit
+ [ ! -d "$LLVM_INSTALL" ] && echo "LLVM_INSTALL:$LLVM_INSTALL not found" && exit
+ echo "===1===> Compiling Host Master Function EmissaryMPI found in EmissaryMPI.cpp"
+ $LLVM_INSTALL/bin/clang++ ../EmissaryMPI.cpp -I.. -I$MPI/include -O3 -c -fPIC -o EmissaryMPI.o
+ echo "===2===> Compiling and linking OpenMP application"
+ export OMPI_CC=$LLVM_INSTALL/bin/clang++
+ mpic++ -fopenmp --offload-arch=$OFFLOAD_ARCH EmissaryMPI_example.cpp -I.. -Xlinker EmissaryMPI.o
+ echo "===3===> Executing 2 MPI ranks with ./a.out on GPU $OFFLOAD_ARCH"
+ mpirun -np 2 a.out
+
+The shell compiles the host master function EmissaryMPI to resolve the weak
+reference to EmissaryMPI(...) provided by the OpenMP runtime.
+which is then linked to the application a.out. The platform-specific MPI
+might put EmissaryMPI in their library.
+
+A typical installation of OpenMPI would not accept device pointers. However, a platform-specific
+implementation might have GPU-aware host library that does recognize device pointers.
+
+To make the typical installation work, EmissaryMPIs provide the definition of
+device-to-host (send) and host-to-device(receive) transfer vectors.
+
+The number of send transfer
+vectors and receive transfer vectors are embedded into the first argument to _emissary_exec
+with the macro _PACK_EMIS_IDS. The macro _PACK_EMIS_IDS has 4 16-bit fields: The enum ID
+of the Emissary ID , the function index within that Emissary ID, the number of send
+transfer vectors, and the number of receive transfer vectors. In the above example,
+these first arguments are used in EmissaryMPI.h.
+
+.. code-block::
+
+ PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0)
+ PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1)
+
+Each transfer vector adds two arguments to the call to _emissary_exec: the device pointer
+argument, followed by the length in number of bytes to transfer. Obviously the use of
+transfer vectors slows runtime execution to allocate and move data.
+
+If the host platform-specific library was GPU-aware, no transfer vectors would be required.
+In this case the first arg in the header to _emissary_exec would be:
+
+.. code-block::
+
+ PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 0, 0)
+ PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 0)
+
+This would be significantly faster than the typical OpenMPI that require
+transfer vectors.
+
+PACK_EMIS_IDS generates a compile time 64-bit constant to the first argument
+to _emissary_exec, followed by transfer vectors (if any) and then followed by
+the actual arguments to the host function. The device pass of the clang compiler
+emits code to pack the arguments into a buffer and generates a call to the
+proper RPC function.
+
+The use of Emissary API relieves the API maintainer from implementing different
+RPC functions to manage different sets of arguments. The later approach is what
+is done in the implementation of most libc functions. Emissary APIs are
+useful when there are complex sets of arguments such as in IO APIs.
diff --git a/libc/docs/gpu/index.rst b/libc/docs/gpu/index.rst
index 1fca67205acb4..9d2c3dc602031 100644
--- a/libc/docs/gpu/index.rst
+++ b/libc/docs/gpu/index.rst
@@ -18,3 +18,4 @@ learn more about this project.
rpc
testing
motivation
+ emissary
diff --git a/libc/shared/rpc_server.h b/libc/shared/rpc_server.h
index 46e35f13f0eac..1149fa18e597d 100644
--- a/libc/shared/rpc_server.h
+++ b/libc/shared/rpc_server.h
@@ -10,12 +10,14 @@
#define LLVM_LIBC_SHARED_RPC_SERVER_H
#include "libc_common.h"
+#include "src/__support/RPC/emissary_rpc_server.h"
#include "src/__support/RPC/rpc_server.h"
namespace LIBC_NAMESPACE_DECL {
namespace shared {
using LIBC_NAMESPACE::rpc::handle_libc_opcodes;
+using LIBC_NAMESPACE::rpc::handleEmissaryOpcodes;
} // namespace shared
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/RPC/CMakeLists.txt b/libc/src/__support/RPC/CMakeLists.txt
index cac9c4e05e369..876bd221dab03 100644
--- a/libc/src/__support/RPC/CMakeLists.txt
+++ b/libc/src/__support/RPC/CMakeLists.txt
@@ -6,6 +6,7 @@ add_object_library(
rpc_client
SRCS
rpc_client.cpp
+ emissary_device_utils.cpp
HDRS
rpc_client.h
DEPENDS
diff --git a/libc/src/__support/RPC/emissary_device_utils.cpp b/libc/src/__support/RPC/emissary_device_utils.cpp
new file mode 100644
index 0000000000000..0900b5f2b56bd
--- /dev/null
+++ b/libc/src/__support/RPC/emissary_device_utils.cpp
@@ -0,0 +1,115 @@
+//===- emisssary_device_utils.cpp - utils for Emissary APIs ------- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device functions emitted by clang/lib/CodeGen/EmitEmissaryExec.cpp
+//
+//===----------------------------------------------------------------------===//
+
+// #include "Allocator.h"
+// #include "Configuration.h"
+// #include "DeviceTypes.h"
+#include "EmissaryIds.h"
+#include "rpc_client.h"
+#include "shared/rpc.h"
+#include "src/__support/macros/config.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+
+extern "C" {
+
+#ifdef __NVPTX__
+[[gnu::leaf]] void *malloc(size_t Size);
+[[gnu::leaf]] void free(void *Ptr);
+#endif
+
+/// static auto null_string = "(null)";
+
+// namespace LIBC_NAMESPACE_DECL{
+// namespace rpc {
+
+// The clang compiler will generate calls to __strlen_max when string length
+// is not compile time constant.
+uint32_t __strlen_max(const char *instr, uint32_t maxstrlen) {
+ if (instr == 0) // encountered a null pointer to string
+ return 0;
+ for (uint32_t i = 0; i < maxstrlen; i++)
+ if (instr[i] == (char)0)
+ return (uint32_t)(i + 1);
+ return maxstrlen;
+}
+
+void *__llvm_emissary_premalloc(uint32_t sz) {
+#ifdef __NVPTX__
+ return malloc((size_t)sz);
+#else
+ return LIBC_NAMESPACE::malloc((size_t)sz);
+#endif
+}
+unsigned long long __llvm_emissary_rpc(uint32_t sz32, void *bufdata) {
+ rpc::Client::Port Port = LIBC_NAMESPACE::rpc::client.open<OFFLOAD_EMISSARY>();
+ Port.send_n(bufdata, (size_t)sz32);
+ unsigned long long Ret;
+ Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+ Ret = static_cast<unsigned long long>(Buffer->data[0]);
+ });
+#ifdef __NVPTX__
+ free(bufdata);
+#else
+ LIBC_NAMESPACE::free(bufdata);
+#endif
+ return Ret;
+}
+
+// This is for emissary APIs that require d2h or h2d memory transfers.
+unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
+ rpc::Client::Port Port =
+ LIBC_NAMESPACE::rpc::client.open<OFFLOAD_EMISSARY_DM>();
+ Port.send_n(bufdata, (size_t)sz32);
+ char *data = (char *)bufdata;
+ uint32_t *int32_data = (uint32_t *)data;
+ uint32_t NumArgs = int32_data[1];
+ char *keyptr = data + (2 * sizeof(int));
+ char *argptr = keyptr + (NumArgs * sizeof(int));
+ if (((size_t)argptr) % (size_t)8)
+ argptr += 4; // argptr must be aligned
+ uint64_t arg1 = *(uint64_t *)argptr;
+ uint32_t NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+ uint32_t NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+ // Skip by arg1 and process Send and Recv Xfers if any
+ argptr += sizeof(uint64_t);
+ for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
+ void *D2Hdata = (void *)*((uint64_t *)argptr);
+ argptr += sizeof(void *);
+ size_t D2Hsize = ((size_t)*((size_t *)argptr) & 0x00000000FFFFFFFF);
+ argptr += sizeof(size_t);
+ Port.send_n(D2Hdata, D2Hsize);
+ }
+ for (uint32_t idx = 0; idx < NumRecvXfers; idx++) {
+ void *H2Ddata = (void *)*((uint64_t *)argptr);
+ argptr += sizeof(void *);
+ argptr += sizeof(size_t);
+ uint64_t recv_size;
+ void *buf = nullptr;
+ Port.recv_n(&buf, &recv_size,
+ [&](uint64_t) { return reinterpret_cast<void *>(H2Ddata); });
+ }
+ unsigned long long Ret;
+ Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+ Ret = static_cast<unsigned long long>(Buffer->data[0]);
+ });
+#ifdef __NVPTX__
+ free(bufdata);
+#else
+ LIBC_NAMESPACE::free(bufdata);
+#endif
+ return Ret;
+}
+
+//} // end namespace rpc
+//} // end namespace LIBC_NAMESPACE_DECL
+} // end extern "C"
diff --git a/libc/src/__support/RPC/emissary_rpc_server.h b/libc/src/__support/RPC/emissary_rpc_server.h
new file mode 100644
index 0000000000000..bf6757d9c3b64
--- /dev/null
+++ b/libc/src/__support/RPC/emissary_rpc_server.h
@@ -0,0 +1,515 @@
+//===-- Shared memory RPC server instantiation ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is intended to be used externally as part of the `shared/`
+// interface. Consider this an extenion of rpc_server.h to support emissary
+// APIs. rpc_server.h must be included first.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+#define LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
+
+#include "../clang/lib/Headers/EmissaryIds.h"
+#include "rpc_server.h"
+#include <string.h>
+#include <unordered_map>
+
+namespace EmissaryExternal {
+extern "C" {
+/// Called by EmissaryTop for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+ emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support Fortran IO runtime
+__attribute((weak)) EmissaryReturn_t EmissaryFortrt(char *data,
+ emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+/// Called by EmissaryTop to support printf/fprintf/asan report externally
+__attribute((weak)) EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+} // end extern "C"
+} // namespace EmissaryExternal
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+#if 1
+enum TypeID {
+ // PrimitiveTypes
+ HalfTyID = 0, ///< 16-bit floating point type
+ BFloatTyID, ///< 16-bit floating point type (7-bit significand)
+ FloatTyID, ///< 32-bit floating point type
+ DoubleTyID, ///< 64-bit floating point type
+ X86_FP80TyID, ///< 80-bit floating point type (X87)
+ FP128TyID, ///< 128-bit floating point type (112-bit significand)
+ PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+ VoidTyID, ///< type with no size
+ LabelTyID, ///< Labels
+ MetadataTyID, ///< Metadata
+ X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
+ TokenTyID, ///< Tokens
+
+ // Derived types... see DerivedTypes.h file.
+ IntegerTyID, ///< Arbitrary bit width integers
+ ByteTyID, ///< Arbitrary bit width bytes
+ FunctionTyID, ///< Functions
+ PointerTyID, ///< Pointers
+ StructTyID, ///< Structures
+ ArrayTyID, ///< Arrays
+ FixedVectorTyID, ///< Fixed width SIMD vector type
+ ScalableVectorTyID, ///< Scalable SIMD vector type
+ TypedPointerTyID, ///< Typed pointer used by some GPU targets
+ TargetExtTyID, ///< Target extension type
+};
+#endif
+
+namespace LIBC_NAMESPACE_DECL {
+namespace internal {
+
+// emisExtractArgBuf extract ArgBuf using protocol EmitEmissaryExec makes.
+static void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+ uint32_t *int32_data = (uint32_t *)data;
+ ab->DataLen = int32_data[0];
+ ab->NumArgs = int32_data[1];
+
+ // Note: while the data buffer contains all args including strings,
+ // ab->DataLen does not include strings. It only counts header, keys,
+ // and aligned numerics.
+
+ ab->keyptr = data + (2 * sizeof(int));
+ ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+ ab->strptr = data + (size_t)ab->DataLen;
+ int alignfill = 0;
+ if (((size_t)ab->argptr) % (size_t)8) {
+ ab->argptr += 4;
+ alignfill = 4;
+ }
+
+ // Extract the two emissary identifiers and number of send
+ // and recv device data transfers. These are 4 16 bit values
+ // packed into a single 64-bit field.
+ uint64_t arg1 = *(uint64_t *)ab->argptr;
+ ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+ ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+ ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+ ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+ // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+ ab->keyptr += sizeof(int);
+ ab->argptr += sizeof(uint64_t);
+ ab->NumArgs -= 1;
+
+ // data_not_used used for testing consistency.
+ ab->data_not_used =
+ (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+ sizeof(uint64_t) + alignfill);
+
+ // Ensure first arg after emissary id arg is aligned.
+ if (((size_t)ab->argptr) % (size_t)8) {
+ ab->argptr += 4;
+ ab->data_not_used -= 4;
+ }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+static uint64_t getuint32(char *val) {
+ uint32_t i32 = *(uint32_t *)val;
+ return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+static uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+// build argument array to create call to variadic wrappers
+static uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+ unsigned long long *data_not_used, emis_argptr_t *a[],
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ size_t num_bytes;
+ size_t bytes_consumed;
+ size_t strsz;
+ size_t fillerNeeded;
+
+ uint argcount = 0;
+
+ for (int argnum = 0; argnum < NumArgs; argnum++) {
+ num_bytes = 0;
+ strsz = 0;
+ unsigned int key = *(unsigned int *)keyptr;
+ unsigned int llvmID = key >> 16;
+ unsigned int numbits = (key << 16) >> 16;
+
+ switch (llvmID) {
+ case FloatTyID: ///< 2: 32-bit floating point type
+ case DoubleTyID: ///< 3: 64-bit floating point type
+ case FP128TyID: ///< 5: 128-bit floating point type (112-bit mantissa)
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+
+ if (num_bytes == 4)
+ a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+ else
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ break;
+
+ case IntegerTyID: ///< 11: Arbitrary bit width integers
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+
+ if (num_bytes == 4)
+ a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+ else
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ break;
+
+ case PointerTyID: { ///< 15: Pointers
+ if (numbits == 1) { // This is a pointer to string
+ num_bytes = 4;
+ bytes_consumed = num_bytes;
+ strsz = (size_t)*(unsigned int *)dataptr;
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ a[argcount] = (emis_argptr_t *)((char *)strptr);
+ } else {
+ num_bytes = 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded; // dataptr is now aligned
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ }
+ if (D2HAddrList) {
+ auto found = D2HAddrList->find((void *)a[argcount]);
+ if (found != D2HAddrList->end())
+ a[argcount] = (emis_argptr_t *)found->second;
+ }
+ } break;
+
+ case HalfTyID: ///< 1: 16-bit floating point type
+ case ArrayTyID: ///< 14: Arrays
+ case StructTyID: ///< 13: Structures
+ case FunctionTyID: ///< 12: Functions
+ case TokenTyID: ///< 10: Tokens
+ case MetadataTyID: ///< 8: Metadata
+ case LabelTyID: ///< 7: Labels
+ case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
+ ///< PowerPC)
+ case X86_FP80TyID: ///< 4: 80-bit floating point type (X87)
+ case FixedVectorTyID: ///< 16: Fixed width SIMD vector type
+ case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+ case TypedPointerTyID: ///< Typed pointer used by some GPU targets
+ case TargetExtTyID: ///< Target extension type
+ case VoidTyID:
+ return _ERC_UNSUPPORTED_ID_ERROR;
+ break;
+ default:
+ return _ERC_INVALID_ID_ERROR;
+ }
+ // Move to next argument
+ dataptr += num_bytes;
+ strptr += strsz;
+ *data_not_used -= bytes_consumed;
+ keyptr += 4;
+ argcount++;
+ }
+ return _ERC_SUCCESS;
+}
+
+// Utility to skip two args in the ArgBuf
+static void emisSkipXferArgSet(emisArgBuf_t *ab) {
+ // Skip the ptr and size of the Xfer
+ ab->NumArgs -= 2;
+ ab->keyptr += 2 * sizeof(uint32_t);
+ ab->argptr += 2 * sizeof(void *);
+ ab->data_not_used -= 2 * sizeof(void *);
+}
+
+static EmissaryReturn_t
+EmissaryTop(char *data, emisArgBuf_t *ab,
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ EmissaryReturn_t result = 0;
+ emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+ sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+ switch (ab->emisid) {
+ case EMIS_ID_INVALID: {
+ fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+ result = 0;
+ break;
+ }
+ case EMIS_ID_PRINT: {
+ result = EmissaryExternal::EmissaryPrint(data, ab, args);
+ break;
+ }
+ case EMIS_ID_MPI: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS) {
+ return (EmissaryReturn_t)0;
+ }
+ result = EmissaryExternal::EmissaryMPI(data, ab, args);
+ break;
+ }
+ case EMIS_ID_HDF5: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryExternal::EmissaryHDF5(data, ab, args);
+ break;
+ }
+ case EMIS_ID_FORTRT: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryExternal::EmissaryFortrt(data, ab, args);
+ break;
+ break;
+ }
+
+ case EMIS_ID_RESERVE: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryExternal::EmissaryReserve(data, ab, args);
+ break;
+ }
+ default:
+ fprintf(stderr,
+ "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+ ab->emisid, ab->emisfnid);
+ }
+ free(args);
+ return result;
+}
+
+// -----------------------------------------------------------------
+// -- Handle OFFLOAD_EMISSARY and OFFLOAD_EMISSARY_DM opcodes --
+// -- handle_emissary_impl calls EmissaryTop for each active lane --
+// -----------------------------------------------------------------
+template <uint32_t NumLanes>
+LIBC_INLINE static ::rpc::Status
+handle_emissary_impl(::rpc::Server::Port &port) {
+
+ switch (port.get_opcode()) {
+
+ // This case handles the device function __llvm_emissary_rpc for emissary
+ // APIs that require no d2h or h2d memory transfer.
+ case OFFLOAD_EMISSARY: {
+ uint64_t Sizes[NumLanes] = {0};
+ unsigned long long Results[NumLanes] = {0};
+ void *buf_ptrs[NumLanes] = {nullptr};
+ port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+ uint32_t id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t ab;
+ emisExtractArgBuf((char *)buffer_ptr, &ab);
+ Results[id++] = EmissaryTop((char *)buffer_ptr, &ab, nullptr);
+ }
+ }
+ port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+ Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+ delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+ });
+ break;
+ }
+
+ // This case handles the device function __llvm_emissary_rpc_dm for emissary
+ // APIs require D2H or H2D transfer vectors to be processed through the port.
+ // FIXME: test with multiple transfer vectors of the same type.
+ case OFFLOAD_EMISSARY_DM: {
+ uint64_t Sizes[NumLanes] = {0};
+ unsigned long long Results[NumLanes] = {0};
+ void *buf_ptrs[NumLanes] = {nullptr};
+ port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+ uint32_t id = 0;
+ emisArgBuf_t AB[NumLanes];
+ std::unordered_map<void *, void *> D2HAddrList;
+ void *Xfers[NumLanes] = {nullptr};
+ void *devXfers[NumLanes] = {nullptr};
+ uint64_t XferSzs[NumLanes] = {0};
+ uint32_t numSendXfers = 0;
+ id = 0;
+
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+
+ emisArgBuf_t *ab = &AB[id];
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+ numSendXfers++;
+ devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+ XferSzs[id] = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
+ emisSkipXferArgSet(ab);
+ }
+ // Allocate the host space for the receive Xfers
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ size_t devSz = (((size_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+ 0x00000000FFFFFFFF);
+ void *hostAddr = new char[devSz];
+ D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+
+ // recv_n for device send_n into new host-allocated Xfers
+ if (numSendXfers)
+ port.recv_n(Xfers, XferSzs,
+ [&](uint64_t Size) { return new char[Size]; });
+
+ // Xfers now contains just allocated host addrs for sends and
+ // devXfers contains corresponding devAddr for those sends
+ // Build map to pass to Emissary
+ id = 0;
+ for (void *Xfer : Xfers) {
+ if (Xfer) {
+ D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+ id++;
+ }
+ }
+
+ // Call EmissaryTop for each active lane
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+ emisSkipXferArgSet(ab);
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+ emisSkipXferArgSet(ab);
+ Results[id] = EmissaryTop((char *)buffer_ptr, ab, &D2HAddrList);
+ id++;
+ }
+ }
+
+ // Process send_n for the H2D Xfers.
+ void *recvXfers[NumLanes] = {nullptr};
+ uint64_t recvXferSzs[NumLanes] = {0};
+ id = 0;
+ uint32_t numRecvXfers = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ // Reset ArgBuf tracker
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+ emisSkipXferArgSet(ab);
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ numRecvXfers++;
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ recvXfers[id] = D2HAddrList[devAddr];
+ recvXferSzs[id] =
+ (((uint64_t)*((size_t *)(ab->argptr + sizeof(void *)))) &
+ 0x00000000FFFFFFFF);
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+ if (numRecvXfers)
+ port.send_n(recvXfers, recvXferSzs);
+
+ // Cleanup all host allocated transfer buffers
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ // Reset the ArgBuf tracker ab
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ // Cleanup host allocated send Xfers
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ void *hostAddr = D2HAddrList[devAddr];
+ delete[] reinterpret_cast<char *>(hostAddr);
+ emisSkipXferArgSet(ab);
+ }
+ // Cleanup host allocated bufs
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ void *hostAddr = D2HAddrList[devAddr];
+ delete[] reinterpret_cast<char *>(hostAddr);
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+
+ port.send([&](::rpc::Buffer *Buffer, uint32_t ID) {
+ Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+ delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+ });
+
+ break;
+ } // END CASE OFFLOAD_EMISSARY_DM
+
+ default: {
+ return ::rpc::RPC_UNHANDLED_OPCODE;
+ break;
+ }
+ }
+ return ::rpc::RPC_SUCCESS;
+} // end handle_emissary_impl
+
+} // namespace internal
+} // namespace LIBC_NAMESPACE_DECL
+
+namespace LIBC_NAMESPACE_DECL {
+namespace rpc {
+LIBC_INLINE ::rpc::Status handleEmissaryOpcodes(::rpc::Server::Port &port,
+ uint32_t NumLanes) {
+ if (NumLanes == 1)
+ return internal::handle_emissary_impl<1>(port);
+ else if (NumLanes == 32)
+ return internal::handle_emissary_impl<32>(port);
+ else if (NumLanes == 64)
+ return internal::handle_emissary_impl<64>(port);
+ else
+ return ::rpc::RPC_ERROR;
+}
+
+} // namespace rpc
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_RPC_EMISSARY_RPC_SERVER_H
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 83cde630ebf40..4a8a0647e727f 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -115,6 +115,9 @@ runServer(plugin::GenericDeviceTy &Device, void *Buffer,
if (Status == rpc::RPC_UNHANDLED_OPCODE)
Status = LIBC_NAMESPACE::shared::handle_libc_opcodes(*Port, NumLanes);
+ if (Status == rpc::RPC_UNHANDLED_OPCODE)
+ Status = LIBC_NAMESPACE::shared::handleEmissaryOpcodes(*Port, NumLanes);
+
#ifdef OFFLOAD_HAS_FLANG_RT
if (Status == rpc::RPC_UNHANDLED_OPCODE)
Status = static_cast<rpc::Status>(
>From 205aed3c859e12e6e8130fc69d357e5cbce9bfe8 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Fri, 20 Mar 2026 09:43:37 -0500
Subject: [PATCH 2/2] [OpenMP] remove need to copy llvm::type::TypeID to the
Emissary runtime
---
clang/lib/CodeGen/CGEmitEmissaryExec.cpp | 43 ++++++++++++-
clang/lib/Headers/EmissaryIds.h | 7 ++
libc/src/__support/RPC/emissary_rpc_server.h | 67 ++------------------
3 files changed, 54 insertions(+), 63 deletions(-)
diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
index d9511244a0b8c..48585a8f3b496 100644
--- a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -134,6 +134,46 @@ static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
// A macro to pack the llvm type ID and numbits into 4-byte key
#define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
+static EmisTyID getEmisTyID(llvm::Type::TypeID tyid) {
+ switch (tyid) {
+ case llvm::Type::HalfTyID: ///< 16-bit floating point type
+ case llvm::Type::X86_FP80TyID: ///< 80-bit floating point type (X87)
+ case llvm::Type::BFloatTyID: ///< 16-bit floating point type (7-bit
+ ///< significand)
+ return EmisInvalidTy;
+ case llvm::Type::FloatTyID: ///< 32-bit floating point type
+ case llvm::Type::DoubleTyID: ///< 64-bit floating point type
+ case llvm::Type::FP128TyID: ///< 128-bit floating point type (112-bit
+ ///< significand)
+ return EmisFloatTy;
+ case llvm::Type::PPC_FP128TyID: ///< 128-bit floating point type (two 64-bits,
+ ///< PowerPC)
+ case llvm::Type::VoidTyID: ///< type with no size
+ case llvm::Type::LabelTyID: ///< Labels
+ case llvm::Type::MetadataTyID: ///< Metadata
+ case llvm::Type::X86_AMXTyID: ///< AMX vectors (8192 bits, X86 specific)
+ case llvm::Type::TokenTyID: ///< Tokens
+ return EmisInvalidTy;
+ // Derived types... see DerivedTypes.h file.
+ case llvm::Type::IntegerTyID: ///< Arbitrary bit width integers
+ return EmisIntegerTy;
+ case llvm::Type::ByteTyID: ///< Arbitrary bit width bytes
+ case llvm::Type::FunctionTyID: ///< Functions
+ return EmisInvalidTy;
+ case llvm::Type::PointerTyID: ///< Pointers
+ return EmisPointerTy;
+ case llvm::Type::StructTyID: ///< Structures
+ case llvm::Type::ArrayTyID: ///< Arrays
+ case llvm::Type::FixedVectorTyID: ///< Fixed width SIMD vector type
+ case llvm::Type::ScalableVectorTyID: ///< Scalable SIMD vector type
+ case llvm::Type::TypedPointerTyID: ///< Typed pointer used by some GPU targets
+ case llvm::Type::TargetExtTyID: ///< Target extension type
+ return EmisInvalidTy;
+ default:
+ return EmisInvalidTy;
+ }
+}
+
// ----- External function EmitEmissaryExec called from CGExpr.cpp -----
RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
assert(getTarget().getTriple().isAMDGCN() ||
@@ -278,6 +318,7 @@ RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
llvm::Type::TypeID argtypeid =
Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+ EmisTyID emis_tyid = getEmisTyID(argtypeid);
// Get type size in bits. Usually 64 or 32.
uint32_t numbits = 0;
@@ -291,7 +332,7 @@ RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
numbits = ty->getScalarSizeInBits();
// Create a key that combines llvm typeID and size
llvm::Value *Key =
- llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+ llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(emis_tyid, numbits));
P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
}
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
index 921bc5453b848..8056ed0e7ee83 100644
--- a/clang/lib/Headers/EmissaryIds.h
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -31,6 +31,13 @@ _emissary_exec(const unsigned long long int, ...);
((unsigned long long)a << 48) | ((unsigned long long)b << 32) | \
((unsigned long long)c << 16) | ((unsigned long long)d)
+enum EmisTyID {
+ EmisInvalidTy = 0,
+ EmisFloatTy,
+ EmisIntegerTy,
+ EmisPointerTy,
+};
+
/// These are the various Emissary APIs currently defined.
/// MPI, HDF5, and, RESERVE are "external" Emissary APIs whose device stubs and
/// host runtime support are provided by library maintainers typically in the
diff --git a/libc/src/__support/RPC/emissary_rpc_server.h b/libc/src/__support/RPC/emissary_rpc_server.h
index bf6757d9c3b64..6b52984634e03 100644
--- a/libc/src/__support/RPC/emissary_rpc_server.h
+++ b/libc/src/__support/RPC/emissary_rpc_server.h
@@ -42,42 +42,6 @@ __attribute((weak)) EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab,
} // end extern "C"
} // namespace EmissaryExternal
-// We would like to get llvm typeID enum from Type.h. e.g.
-// #include ".../llvm/include/llvm/IR/Type.h"
-// But we cannot include LLVM headers in a runtime function.
-// So we a have a manual copy of llvm TypeID enum from Type.h
-// The codegen for _emissary_exec puts this ID in the key for
-// each arg and the host runtime needs to decode this key.
-#if 1
-enum TypeID {
- // PrimitiveTypes
- HalfTyID = 0, ///< 16-bit floating point type
- BFloatTyID, ///< 16-bit floating point type (7-bit significand)
- FloatTyID, ///< 32-bit floating point type
- DoubleTyID, ///< 64-bit floating point type
- X86_FP80TyID, ///< 80-bit floating point type (X87)
- FP128TyID, ///< 128-bit floating point type (112-bit significand)
- PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
- VoidTyID, ///< type with no size
- LabelTyID, ///< Labels
- MetadataTyID, ///< Metadata
- X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
- TokenTyID, ///< Tokens
-
- // Derived types... see DerivedTypes.h file.
- IntegerTyID, ///< Arbitrary bit width integers
- ByteTyID, ///< Arbitrary bit width bytes
- FunctionTyID, ///< Functions
- PointerTyID, ///< Pointers
- StructTyID, ///< Structures
- ArrayTyID, ///< Arrays
- FixedVectorTyID, ///< Fixed width SIMD vector type
- ScalableVectorTyID, ///< Scalable SIMD vector type
- TypedPointerTyID, ///< Typed pointer used by some GPU targets
- TargetExtTyID, ///< Target extension type
-};
-#endif
-
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -145,20 +109,16 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
size_t bytes_consumed;
size_t strsz;
size_t fillerNeeded;
-
uint argcount = 0;
-
for (int argnum = 0; argnum < NumArgs; argnum++) {
num_bytes = 0;
strsz = 0;
unsigned int key = *(unsigned int *)keyptr;
- unsigned int llvmID = key >> 16;
+ unsigned int emis_id = key >> 16;
unsigned int numbits = (key << 16) >> 16;
- switch (llvmID) {
- case FloatTyID: ///< 2: 32-bit floating point type
- case DoubleTyID: ///< 3: 64-bit floating point type
- case FP128TyID: ///< 5: 128-bit floating point type (112-bit mantissa)
+ switch (emis_id) {
+ case EmisFloatTy:
num_bytes = numbits / 8;
bytes_consumed = num_bytes;
fillerNeeded = ((size_t)dataptr) % num_bytes;
@@ -175,7 +135,7 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
a[argcount] = (emis_argptr_t *)getuint64(dataptr);
break;
- case IntegerTyID: ///< 11: Arbitrary bit width integers
+ case EmisIntegerTy:
num_bytes = numbits / 8;
bytes_consumed = num_bytes;
fillerNeeded = ((size_t)dataptr) % num_bytes;
@@ -192,7 +152,7 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
a[argcount] = (emis_argptr_t *)getuint64(dataptr);
break;
- case PointerTyID: { ///< 15: Pointers
+ case EmisPointerTy: {
if (numbits == 1) { // This is a pointer to string
num_bytes = 4;
bytes_consumed = num_bytes;
@@ -219,23 +179,6 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
}
} break;
- case HalfTyID: ///< 1: 16-bit floating point type
- case ArrayTyID: ///< 14: Arrays
- case StructTyID: ///< 13: Structures
- case FunctionTyID: ///< 12: Functions
- case TokenTyID: ///< 10: Tokens
- case MetadataTyID: ///< 8: Metadata
- case LabelTyID: ///< 7: Labels
- case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
- ///< PowerPC)
- case X86_FP80TyID: ///< 4: 80-bit floating point type (X87)
- case FixedVectorTyID: ///< 16: Fixed width SIMD vector type
- case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
- case TypedPointerTyID: ///< Typed pointer used by some GPU targets
- case TargetExtTyID: ///< Target extension type
- case VoidTyID:
- return _ERC_UNSUPPORTED_ID_ERROR;
- break;
default:
return _ERC_INVALID_ID_ERROR;
}
More information about the cfe-commits
mailing list