[Openmp-commits] [clang] [llvm] [openmp] [OpenMP] support for Emissary APIs as discussed in 89169 (PR #175265)
Greg Rodgers via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jan 12 04:45:09 PST 2026
https://github.com/gregrodgers updated https://github.com/llvm/llvm-project/pull/175265
>From cdcedd36b2a590e443974941bd3d4873abe21796 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Fri, 9 Jan 2026 17:01:48 -0600
Subject: [PATCH 1/2] [OpenMP] support for Emissary APIs as discussed in 89169
---
clang/include/clang/Basic/LangOptions.def | 1 +
clang/include/clang/Options/Options.td | 6 +
clang/lib/CodeGen/CGEmitEmissaryExec.cpp | 371 ++++++++++++++
clang/lib/CodeGen/CGExpr.cpp | 11 +
clang/lib/CodeGen/CMakeLists.txt | 1 +
clang/lib/CodeGen/CodeGenFunction.h | 1 +
clang/lib/Driver/ToolChains/Clang.cpp | 7 +
clang/lib/Headers/CMakeLists.txt | 2 +
clang/lib/Headers/EmissaryIds.h | 97 ++++
clang/lib/Headers/EmissaryMPI.h | 202 ++++++++
clang/lib/Headers/llvm_libc_wrappers/stdio.h | 17 +
offload/include/Shared/RPCOpcodes.h | 2 +
offload/libomptarget/CMakeLists.txt | 1 +
offload/plugins-nextgen/common/CMakeLists.txt | 15 +
.../plugins-nextgen/common/include/Emissary.h | 263 ++++++++++
.../plugins-nextgen/common/src/Emissary.cpp | 259 ++++++++++
.../common/src/EmissaryFortrt.cpp | 470 ++++++++++++++++++
.../common/src/EmissaryPrint.cpp | 423 ++++++++++++++++
offload/plugins-nextgen/common/src/RPC.cpp | 152 ++++++
openmp/device/CMakeLists.txt | 10 +
openmp/device/src/EmissaryFortrt.cpp | 144 ++++++
openmp/device/src/EmissaryPrint.cpp | 79 +++
openmp/device/src/Misc.cpp | 62 +++
23 files changed, 2596 insertions(+)
create mode 100644 clang/lib/CodeGen/CGEmitEmissaryExec.cpp
create mode 100644 clang/lib/Headers/EmissaryIds.h
create mode 100644 clang/lib/Headers/EmissaryMPI.h
create mode 100644 offload/plugins-nextgen/common/include/Emissary.h
create mode 100644 offload/plugins-nextgen/common/src/Emissary.cpp
create mode 100644 offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
create mode 100644 offload/plugins-nextgen/common/src/EmissaryPrint.cpp
create mode 100644 openmp/device/src/EmissaryFortrt.cpp
create mode 100644 openmp/device/src/EmissaryPrint.cpp
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8cba1dbaee24e..69ff43cef7307 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -227,6 +227,7 @@ LANGOPT(OpenMPExtensions , 1, 1, NotCompatible, "Enable all Clang extensions fo
LANGOPT(OpenMPSimd , 1, 0, NotCompatible, "Use SIMD only OpenMP support.")
LANGOPT(OpenMPUseTLS , 1, 0, NotCompatible, "Use TLS for threadprivates or runtime calls")
LANGOPT(OpenMPIsTargetDevice , 1, 0, NotCompatible, "Generate code only for OpenMP target device")
+LANGOPT(UseEmissaryPrint , 1, 0, NotCompatible, "Enables use of certain IO functions with Emissary rather than LIBC")
LANGOPT(OpenMPCUDAMode , 1, 0, NotCompatible, "Generate code for OpenMP pragmas in SIMT/SPMD mode")
LANGOPT(OpenMPIRBuilder , 1, 0, NotCompatible, "Use the experimental OpenMP-IR-Builder codegen path.")
LANGOPT(OpenMPCUDANumSMs , 32, 0, NotCompatible, "Number of SMs for CUDA devices.")
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 6a72931727a7c..8cf017d8effd7 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -8940,6 +8940,12 @@ def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
} // let Visibility = [CC1Option, FC1Option]
+defm use_emissary_print: BoolFOption<"use-emissary-print",
+ LangOpts<"UseEmissaryPrint">, DefaultTrue,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option],
+ "Enable use of Emissary printf/fprint overriding device libc printf/fprintf">,
+ NegFlag<SetFalse>>;
+
//===----------------------------------------------------------------------===//
// Coarray Options
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/CGEmitEmissaryExec.cpp b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
new file mode 100644
index 0000000000000..cc574d1ffdcf2
--- /dev/null
+++ b/clang/lib/CodeGen/CGEmitEmissaryExec.cpp
@@ -0,0 +1,371 @@
+//===------- CGEmitEmissaryExec.cpp - Codegen for _emissary_exec --==------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmitEmissaryExec:
+//
+// When a device call to the varadic function _emissary_exec is encountered
+// (in CGExpr.cpp) EmitEmissaryExec does these steps:
+//
+// 1. If string lens are runtime dependent, Emit code to determine runtime len.
+// 2. Emits call to allocate memory __llvm_emissary_premalloc,
+// 3. Emit stores of each arg into arg buffer,
+// 4. Emits call to function __llvm_emissary_rpc or __llvm_emissary_rpc_dm
+//
+// The arg buffer is a struct that contains the length, number of args, an
+// array of 4-byte keys that represent the type of of each arg, an array of
+// aligned "data" values for each arg, and finally the runtime string values.
+// If an arg is a string the data value is the runtime length of the string.
+// Each 4-byte key contains the llvm type ID and the number of bits for the
+// type. encoded by the macro _PACK_TY_BITLEN(x,y) ((uint32_t)x << 16) |
+// ((uint32_t)y)
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../../clang/lib/Headers/EmissaryIds.h"
+#include "CodeGenFunction.h"
+#include "clang/Basic/Builtins.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+// These static helper functions support EmitEmissaryExec.
+static llvm::Function *GetOmpStrlenDeclaration(CodeGenModule &CGM) {
+ auto &M = CGM.getModule();
+ // Args are pointer to char and maxstringlen
+ llvm::Type *ArgTypes[] = {CGM.Int8PtrTy, CGM.Int32Ty};
+ llvm::FunctionType *OmpStrlenFTy =
+ llvm::FunctionType::get(CGM.Int32Ty, ArgTypes, false);
+ if (auto *F = M.getFunction("__strlen_max")) {
+ assert(F->getFunctionType() == OmpStrlenFTy);
+ return F;
+ }
+ llvm::Function *FN = llvm::Function::Create(
+ OmpStrlenFTy, llvm::GlobalVariable::ExternalLinkage, "__strlen_max", &M);
+ return FN;
+}
+
+// Deterimines if an expression is a string with variable lenth
+static bool isVarString(const clang::Expr *argX, const clang::Type *argXTy,
+ const llvm::Value *Arg) {
+ if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+ argXTy->getPointeeOrArrayElementType()->isCharType() && !argX->isLValue())
+ return true;
+ // Ensure the VarDecl has an inititalizer
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(argX))
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl()))
+ if (!VD->getInit() ||
+ !llvm::isa<StringLiteral>(VD->getInit()->IgnoreImplicit()))
+ return true;
+ return false;
+}
+
+// Deterimines if an argument is a string
+static bool isString(const clang::Type *argXTy) {
+ if ((argXTy->isPointerType() || argXTy->isConstantArrayType()) &&
+ argXTy->getPointeeOrArrayElementType()->isCharType())
+ return true;
+ else
+ return false;
+}
+
+// Gets a string literal to write into the transfer buffer
+static const StringLiteral *getSL(const clang::Expr *argX,
+ const clang::Type *argXTy) {
+ // String in argX has known constant length
+ if (!argXTy->isConstantArrayType()) {
+ // Allow constant string to be a declared variable,
+ // But it must be constant and initialized.
+ const DeclRefExpr *DRE = cast<DeclRefExpr>(argX);
+ const VarDecl *VarD = cast<VarDecl>(DRE->getDecl());
+ argX = VarD->getInit()->IgnoreImplicit();
+ }
+ const StringLiteral *SL = cast<StringLiteral>(argX);
+ return SL;
+}
+
+// Returns a function pointer to __llvm_emissary_premalloc
+static llvm::Function *GetEmissaryAllocDeclaration(CodeGenModule &CGM) {
+ auto &M = CGM.getModule();
+ // clang::CodeGen::CodeGenTypes &CGT = CGM.getTypes();
+ const char *_executeName = "__llvm_emissary_premalloc";
+ llvm::Type *ArgTypes[] = {CGM.Int32Ty};
+ llvm::Function *FN;
+ // Maybe this should be pointer to char instead of pointer to void
+ llvm::FunctionType *VargsFnAllocFuncType = llvm::FunctionType::get(
+ CGM.getTypes().ConvertType(
+ CGM.getContext().getPointerType(CGM.getContext().VoidTy)),
+ ArgTypes, false);
+ if (!(FN = M.getFunction(_executeName)))
+ FN = llvm::Function::Create(VargsFnAllocFuncType,
+ llvm::GlobalVariable::ExternalLinkage,
+ _executeName, &M);
+ assert(FN->getFunctionType() == VargsFnAllocFuncType);
+ return FN;
+}
+
+// Returns a function pointer to __llvm_emissary_rpc
+static llvm::Function *GetEmissaryExecDeclaration(CodeGenModule &CGM,
+ bool hasXfers) {
+ const char *_executeName =
+ hasXfers ? "__llvm_emissary_rpc_dm" : "__llvm_emissary_rpc";
+ auto &M = CGM.getModule();
+ llvm::Type *ArgTypes[] = {
+ CGM.Int32Ty, CGM.getTypes().ConvertType(CGM.getContext().getPointerType(
+ CGM.getContext().VoidTy))};
+ llvm::Function *FN;
+ llvm::FunctionType *VarfnFuncType =
+ llvm::FunctionType::get(CGM.Int64Ty, ArgTypes, false);
+ if (!(FN = M.getFunction(_executeName)))
+ FN = llvm::Function::Create(
+ VarfnFuncType, llvm::GlobalVariable::ExternalLinkage, _executeName, &M);
+ assert(FN->getFunctionType() == VarfnFuncType);
+ return FN;
+}
+
+// A macro to pack the llvm type ID and numbits into 4-byte key
+#define _PACK_TY_BITLEN(x, y) ((uint32_t)x << 16) | ((uint32_t)y)
+
+// ----- External function EmitEmissaryExec called from CGExpr.cpp -----
+RValue CodeGenFunction::EmitEmissaryExec(const CallExpr *E) {
+ assert(getTarget().getTriple().isAMDGCN() ||
+ getTarget().getTriple().isNVPTX());
+ assert(E->getNumArgs() >= 1); // _emissary_exec always has at least one arg.
+ const llvm::DataLayout &DL = CGM.getDataLayout();
+ CallArgList Args;
+
+ EmitCallArgs(Args,
+ E->getDirectCallee()->getType()->getAs<FunctionProtoType>(),
+ E->arguments(), E->getDirectCallee(),
+ /* ParamsToSkip = */ 0);
+
+ // We don't know how to emit non-scalar varargs.
+ if (std::any_of(Args.begin() + 1, Args.end(), [&](const CallArg &A) {
+ return !A.getRValue(*this).isScalar();
+ })) {
+ CGM.ErrorUnsupported(E, "non-scalar arg in GPU vargs function");
+ return RValue::get(llvm::ConstantInt::get(IntTy, 0));
+ }
+ // NumArgs always includes emisid, but E->getNumArgs() could be 1 less if
+ // inserted it above.
+ unsigned NumArgs = (unsigned)Args.size();
+ llvm::SmallVector<llvm::Type *, 32> ArgTypes;
+ llvm::SmallVector<llvm::Value *, 32> VarStrLengths;
+ llvm::Value *TotalVarStrsLength = llvm::ConstantInt::get(Int32Ty, 0);
+ bool hasVarStrings = false;
+ ArgTypes.push_back(Int32Ty); // 1st field in struct is total DataLen
+ ArgTypes.push_back(Int32Ty); // 2nd field in struct will be num args
+ // An array of 4-byte keys that describe the arg type
+ for (unsigned I = 0; I < NumArgs; ++I)
+ ArgTypes.push_back(Int32Ty);
+
+ // Track the size of the numeric data length and string length
+ unsigned DataLen_CT =
+ (unsigned)(DL.getTypeAllocSize(Int32Ty)) * (NumArgs + 2);
+ unsigned AllStringsLen_CT = 0;
+
+ // --- 1st Pass over Args to create ArgTypes and count size ---
+ size_t structOffset = 4 * (NumArgs + 2);
+ for (unsigned I = 0; I < NumArgs; I++) {
+ llvm::Value *Arg = Args[I].getRValue(*this).getScalarVal();
+ llvm::Type *ArgType = Arg->getType();
+ // Skip string processing on arg0 which may not be in E->getArg(0)
+ if (I != 0) {
+ const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+ auto *argXTy = argX->getType().getTypePtr();
+ if (isString(argXTy)) {
+ if (isVarString(argX, argXTy, Arg)) {
+ hasVarStrings = true;
+ if (auto *PtrTy = dyn_cast<llvm::PointerType>(ArgType))
+ if (PtrTy->getPointerAddressSpace()) {
+ Arg = Builder.CreateAddrSpaceCast(Arg, CGM.Int8PtrTy);
+ ArgType = Arg->getType();
+ }
+ llvm::Value *VarStrLen =
+ Builder.CreateCall(GetOmpStrlenDeclaration(CGM),
+ {Arg, llvm::ConstantInt::get(Int32Ty, 1024)});
+ VarStrLengths.push_back(VarStrLen);
+ TotalVarStrsLength = Builder.CreateAdd(TotalVarStrsLength, VarStrLen,
+ "sum_of_var_strings_length");
+ ArgType = Int32Ty;
+ } else {
+ const StringLiteral *SL = getSL(argX, argXTy);
+ StringRef ArgString = SL->getString();
+ AllStringsLen_CT += ((int)ArgString.size() + 1);
+ // change ArgType from char ptr to int to contain string length
+ ArgType = Int32Ty;
+ }
+ } // end of processing string argument
+ } // End of skip 1st arg
+ // if ArgTypeSize is >4 bytes we need to insert dummy align
+ // values in the struct so all stores can be aligned .
+ // These dummy fields must be inserted before the arg.
+ //
+ // In the pass below where the stores are generated careful
+ // tracking of the index into the struct is necessary.
+ size_t needsPadding = (structOffset % (size_t)DL.getTypeAllocSize(ArgType));
+ if (needsPadding) {
+ DataLen_CT += (unsigned)needsPadding;
+ structOffset += needsPadding;
+ ArgTypes.push_back(Int32Ty); // could assert that needsPadding == 4 here
+ }
+
+ ArgTypes.push_back(ArgType);
+ DataLen_CT += ((int)DL.getTypeAllocSize(ArgType));
+ structOffset += (size_t)DL.getTypeAllocSize(ArgType);
+ }
+
+ // --- Generate call to __llvm_emissary_premalloc to get data pointer
+ if (hasVarStrings)
+ TotalVarStrsLength = Builder.CreateAdd(
+ TotalVarStrsLength,
+ llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT),
+ "total_buffer_size");
+ llvm::Value *BufferLen =
+ hasVarStrings
+ ? TotalVarStrsLength
+ : llvm::ConstantInt::get(Int32Ty, AllStringsLen_CT + DataLen_CT);
+ llvm::Value *DataStructPtr =
+ Builder.CreateCall(GetEmissaryAllocDeclaration(CGM), {BufferLen});
+
+ // --- Cast the generic return pointer to be a struct in device global memory
+ llvm::StructType *DataStructTy =
+ llvm::StructType::create(ArgTypes, "varfn_args_store");
+ unsigned AS = getContext().getTargetAddressSpace(LangAS::cuda_device);
+ llvm::Value *BufferPtr = Builder.CreatePointerCast(
+ DataStructPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+ "varfn_args_store_casted");
+ // --- Header of struct contains length and NumArgs ---
+ llvm::Value *DataLenField = llvm::ConstantInt::get(Int32Ty, DataLen_CT);
+ llvm::Value *P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 0);
+ Builder.CreateAlignedStore(DataLenField, P,
+ DL.getPrefTypeAlign(DataLenField->getType()));
+ llvm::Value *NumArgsField = llvm::ConstantInt::get(Int32Ty, NumArgs);
+ P = Builder.CreateStructGEP(DataStructTy, BufferPtr, 1);
+ Builder.CreateAlignedStore(NumArgsField, P,
+ DL.getPrefTypeAlign(NumArgsField->getType()));
+
+ // --- 2nd Pass: create array of 4-byte keys to describe each arg
+ for (unsigned I = 0; I < NumArgs; I++) {
+ llvm::Type *ty = Args[I].getRValue(*this).getScalarVal()->getType();
+ llvm::Type::TypeID argtypeid =
+ Args[I].getRValue(*this).getScalarVal()->getType()->getTypeID();
+
+ // Get type size in bits. Usually 64 or 32.
+ uint32_t numbits = 0;
+ if (I > 0 &&
+ isString(E->getArg(I)->IgnoreParenCasts()->getType().getTypePtr()))
+ // The llvm typeID for string is pointer. Since pointer numbits is 0,
+ // we set numbits to 1 to distinguish pointer type ID as string pointer.
+ numbits = 1;
+ else
+ numbits = ty->getScalarSizeInBits();
+ // Create a key that combines llvm typeID and size
+ llvm::Value *Key =
+ llvm::ConstantInt::get(Int32Ty, _PACK_TY_BITLEN(argtypeid, numbits));
+ P = Builder.CreateStructGEP(DataStructTy, BufferPtr, I + 2);
+ Builder.CreateAlignedStore(Key, P, DL.getPrefTypeAlign(Key->getType()));
+ }
+
+ // --- 3rd Pass: Store data values for each arg ---
+ unsigned varstring_index = 0;
+ unsigned structIndex = 2 + NumArgs;
+ structOffset = 4 * structIndex;
+ bool hasXfers;
+ for (unsigned I = 0; I < NumArgs; I++) {
+ llvm::Value *Arg;
+ if (I == 0) {
+ Arg = Args[I].getKnownRValue().getScalarVal();
+ llvm::ConstantInt *CI = llvm::dyn_cast<llvm::ConstantInt>(Arg);
+ uint64_t uint64value = CI->getZExtValue();
+ uint32_t lower_32 = (uint32_t)(uint64value & 0xFFFFFFFF);
+ hasXfers = lower_32 ? true : false;
+ } else {
+ const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+ auto *argXTy = argX->getType().getTypePtr();
+ if (isString(argXTy)) {
+ if (isVarString(argX, argXTy, Arg)) {
+ Arg = VarStrLengths[varstring_index];
+ varstring_index++;
+ } else {
+ const StringLiteral *SL = getSL(argX, argXTy);
+ StringRef ArgString = SL->getString();
+ int ArgStrLen = (int)ArgString.size() + 1;
+ // Change Arg from a char pointer to the integer string length
+ Arg = llvm::ConstantInt::get(Int32Ty, ArgStrLen);
+ }
+ } else {
+ Arg = Args[I].getKnownRValue().getScalarVal();
+ }
+ }
+ size_t structElementSize = (size_t)DL.getTypeAllocSize(Arg->getType());
+ size_t needsPadding = (structOffset % structElementSize);
+ if (needsPadding) {
+ // Skip over dummy fields in struct to align
+ structOffset += needsPadding; // should assert needsPadding == 4
+ structIndex++;
+ }
+ P = Builder.CreateStructGEP(DataStructTy, BufferPtr, structIndex);
+ Builder.CreateAlignedStore(Arg, P, DL.getPrefTypeAlign(Arg->getType()));
+ structOffset += structElementSize;
+ structIndex++;
+ }
+
+ // --- 4th Pass: memcpy all strings after the data values ---
+ // bitcast the struct in device global memory as a char buffer
+ Address BufferPtrByteAddr =
+ Address(Builder.CreatePointerCast(
+ BufferPtr, llvm::PointerType::get(CGM.getLLVMContext(), AS),
+ "_casted"),
+ Int8Ty, CharUnits::fromQuantity(1));
+
+ // BufferPtrByteAddr is a pointer to where we want to write the next string
+ BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+ BufferPtrByteAddr, CharUnits::fromQuantity(DataLen_CT));
+ varstring_index = 0;
+ // Skip string processing on arg0 which may not be in E->getArg(0)
+ for (unsigned I = 1; I < NumArgs; ++I) {
+ llvm::Value *Arg = Args[I].getKnownRValue().getScalarVal();
+ const Expr *argX = E->getArg(I)->IgnoreParenCasts();
+ auto *argXTy = argX->getType().getTypePtr();
+ if (isString(argXTy)) {
+ if (isVarString(argX, argXTy, Arg)) {
+ llvm::Value *varStrLength = VarStrLengths[varstring_index];
+ varstring_index++;
+ Address SrcAddr = Address(Arg, Int8Ty, CharUnits::fromQuantity(1));
+ Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, varStrLength);
+ // update BufferPtrByteAddr for next string memcpy
+ llvm::Value *PtrAsInt = BufferPtrByteAddr.emitRawPointer(*this);
+ BufferPtrByteAddr =
+ Address(Builder.CreateGEP(Int8Ty, PtrAsInt,
+ ArrayRef<llvm::Value *>(varStrLength)),
+ Int8Ty, CharUnits::fromQuantity(1));
+ } else {
+ const StringLiteral *SL = getSL(argX, argXTy);
+ StringRef ArgString = SL->getString();
+ int ArgStrLen = (int)ArgString.size() + 1;
+ Address SrcAddr = CGM.GetAddrOfConstantStringFromLiteral(SL);
+ Builder.CreateMemCpy(BufferPtrByteAddr, SrcAddr, ArgStrLen);
+ // update BufferPtrByteAddr for next memcpy
+ BufferPtrByteAddr = Builder.CreateConstInBoundsByteGEP(
+ BufferPtrByteAddr, CharUnits::fromQuantity(ArgStrLen));
+ }
+ }
+ }
+ // --- Generate call to __llvm_emissary_rpc and return RValue
+ llvm::Value *emis_rc = Builder.CreateCall(
+ GetEmissaryExecDeclaration(CGM, hasXfers), {BufferLen, DataStructPtr});
+ // truncate long long int to int for printf return value.
+ if ((E->getDirectCallee()->getNameAsString() == "fprintf") ||
+ (E->getDirectCallee()->getNameAsString() == "printf"))
+ emis_rc = Builder.CreateTrunc(emis_rc, CGM.Int32Ty, "emis_rc");
+ return RValue::get(emis_rc);
+}
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 6309c37788f0c..a4ee21d968b5e 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -6829,6 +6829,17 @@ RValue CodeGenFunction::EmitCall(QualType CalleeType,
StaticOperator = true;
}
+ // Replace calls to _emissary_exec found in emissary device stubs with calls
+ // to either __llvm_emissary_rpc or __llvm_emissary_rpc_dm. Before the call
+ // EmitEmissaryExec generates code to allocate an arg buffer and to fill the
+ // arg buffer.
+ if ((CGM.getTriple().isAMDGCN() || CGM.getTriple().isNVPTX()) && FnType &&
+ dyn_cast<FunctionProtoType>(FnType) &&
+ dyn_cast<FunctionProtoType>(FnType)->isVariadic() &&
+ (E->getDirectCallee()->getNameAsString() == "_emissary_exec") &&
+ CGM.getLangOpts().OpenMP)
+ return EmitEmissaryExec(E);
+
auto Arguments = E->arguments();
if (StaticOperator) {
// If we're calling a static operator, we need to emit the object argument
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index dbbc35b372f42..f699cee7fea11 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -62,6 +62,7 @@ add_clang_library(clangCodeGen
CGAtomic.cpp
CGBlocks.cpp
CGBuiltin.cpp
+ CGEmitEmissaryExec.cpp
CGCUDANV.cpp
CGCUDARuntime.cpp
CGCXX.cpp
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 855e43631f436..3e962e5d0e34c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4708,6 +4708,7 @@ class CodeGenFunction : public CodeGenTypeCache {
RValue EmitNVPTXDevicePrintfCallExpr(const CallExpr *E);
RValue EmitAMDGPUDevicePrintfCallExpr(const CallExpr *E);
+ RValue EmitEmissaryExec(const CallExpr *E);
RValue EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
const CallExpr *E, ReturnValueSlot ReturnValue);
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 4399eb475be75..b3be55c9d7d3d 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7813,6 +7813,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
}
}
+ // To opt-out of emissary printf/fprint set -fno-use-emissary-print. This
+ // will use the slower device libc for printf/fprintf. In the default mode
+ // (use-emissary-print), all other device libc functions are still active.
+ if (Args.hasFlag(options::OPT_fuse_emissary_print,
+ options::OPT_fno_use_emissary_print, true))
+ CmdArgs.push_back("-DOFFLOAD_ENABLE_EMISSARY_PRINT");
+
if (Triple.isAMDGPU()) {
handleAMDGPUCodeObjectVersionOptions(D, Args, CmdArgs);
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index c92b370b88d2d..c1edefa5f6aaa 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -41,6 +41,8 @@ set(core_files
tgmath.h
unwind.h
varargs.h
+ EmissaryIds.h
+ EmissaryMPI.h
)
set(arm_common_files
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
new file mode 100644
index 0000000000000..f9ff616a0aacb
--- /dev/null
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -0,0 +1,97 @@
+//===- openmp/device/include/EmissaryIds.h enum & headers ----- C++ -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines Emissary API identifiers. This header is used by both host
+// and device compilations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_IDS_H
+#define OFFLOAD_EMISSARY_IDS_H
+/// The sets of emissary APIs under development
+typedef enum {
+ EMIS_ID_INVALID,
+ EMIS_ID_FORTRT,
+ EMIS_ID_PRINT,
+ EMIS_ID_MPI,
+ EMIS_ID_HDF5,
+ EMIS_ID_RESERVE,
+} offload_emis_id_t;
+
+typedef enum {
+ _print_INVALID,
+ _printf_idx,
+ _fprintf_idx,
+ _ockl_asan_report_idx,
+} offload_emis_print_t;
+
+/// The vargs function used by emissary API device stubs
+unsigned long long _emissary_exec(unsigned long long, ...);
+
+//#define _PACK_EMIS_IDS(x, y) \
+// ((unsigned long long)x << 32) | ((unsigned long long)y)
+
+#define _PACK_EMIS_IDS(a, b, c, d) \
+ ((unsigned long long)a << 48) | ((unsigned long long)b << 32) | \
+ ((unsigned long long)c << 16) | ((unsigned long long)d)
+
+typedef enum {
+ _FortranAio_INVALID,
+ _FortranAioBeginExternalListOutput_idx,
+ _FortranAioOutputAscii_idx,
+ _FortranAioOutputInteger32_idx,
+ _FortranAioEndIoStatement_idx,
+ _FortranAioOutputInteger8_idx,
+ _FortranAioOutputInteger16_idx,
+ _FortranAioOutputInteger64_idx,
+ _FortranAioOutputReal32_idx,
+ _FortranAioOutputReal64_idx,
+ _FortranAioOutputComplex32_idx,
+ _FortranAioOutputComplex64_idx,
+ _FortranAioOutputLogical_idx,
+ _FortranAAbort_idx,
+ _FortranAStopStatementText_idx,
+ _FortranAioBeginExternalFormattedOutput_idx,
+ _FortranAStopStatement_idx,
+} offload_emis_fortrt_idx;
+
+/// This structure is created by emisExtractArgBuf to get information
+/// from the data buffer passed by rpc.
+typedef struct {
+ unsigned int DataLen;
+ unsigned int NumArgs;
+ unsigned int emisid;
+ unsigned int emisfnid;
+ unsigned int NumSendXfers;
+ unsigned int NumRecvXfers;
+ unsigned long long data_not_used;
+ char *keyptr;
+ char *argptr;
+ char *strptr;
+} emisArgBuf_t;
+
+typedef unsigned long long EmissaryReturn_t;
+typedef unsigned long long emis_argptr_t;
+typedef EmissaryReturn_t emisfn_t(void *, ...);
+
+#define MAXVARGS 32
+
+typedef enum service_rc {
+ _ERC_SUCCESS = 0,
+ _ERC_STATUS_ERROR = 1,
+ _ERC_DATA_USED_ERROR = 2,
+ _ERC_ADDINT_ERROR = 3,
+ _ERC_ADDFLOAT_ERROR = 4,
+ _ERC_ADDSTRING_ERROR = 5,
+ _ERC_UNSUPPORTED_ID_ERROR = 6,
+ _ERC_INVALID_ID_ERROR = 7,
+ _ERC_ERROR_INVALID_REQUEST = 8,
+ _ERC_EXCEED_MAXVARGS_ERROR = 9,
+} service_rc;
+
+#endif // OFFLOAD_EMISSARY_IDS_H
diff --git a/clang/lib/Headers/EmissaryMPI.h b/clang/lib/Headers/EmissaryMPI.h
new file mode 100644
index 0000000000000..c9529e1ea73ce
--- /dev/null
+++ b/clang/lib/Headers/EmissaryMPI.h
@@ -0,0 +1,202 @@
+//===---------------- openmp/device/include/EmissaryMPI.h ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// EmissaryMPI.h This include must be included by MPI application
+//
+//===----------------------------------------------------------------------===//
+#include "EmissaryIds.h"
+#include <stdarg.h>
+#include <unordered_map>
+
+typedef enum {
+ _MPI_INVALID,
+ _MPI_Send_idx,
+ _MPI_Recv_idx,
+ _MPI_Allreduce_idx,
+ _MPI_Reduce_idx,
+} offload_emis_mpi_t;
+
+// -------- DELETE THIS BLOCK WHEN MPI_Type_size on GPU WORKS ----------------
+// Emissary_Initialize_MPI builds a table of lengths for each each MPI_Datatype.
+// That table is called _mpi_type_lens and passed to the device.
+// We need this because we do not yet have a GPU version of MPI_Type_size.
+// If we did we can avoid the table search for datatype_size. This is
+// how MPI datatype length should be calculated.
+// int datatype_size;
+// MPI_Type_size(v2,&datatype_size) ;
+// Delete this table search when we have a working MPI_Type_size.
+#define _MPI_DATATYPES 5
+typedef struct mpi_type_len_t {
+ uint64_t dt_signature;
+ uint32_t dt_size;
+} mpi_type_len_t;
+#pragma omp begin declare target
+mpi_type_len_t _mpi_type_lens[_MPI_DATATYPES];
+#pragma omp end declare target
+void Emissary_Initialize_MPI() {
+ MPI_Datatype _mpi_int = MPI_INT;
+ MPI_Datatype _mpi_float = MPI_FLOAT;
+ MPI_Datatype _mpi_unsigned = MPI_UNSIGNED;
+ MPI_Datatype _mpi_double = MPI_DOUBLE;
+ MPI_Datatype _mpi_char = MPI_CHAR;
+ _mpi_type_lens[0] = {(uint64_t)_mpi_int, 4};
+ _mpi_type_lens[1] = {(uint64_t)_mpi_unsigned, 4};
+ _mpi_type_lens[2] = {(uint64_t)_mpi_float, 4};
+ _mpi_type_lens[3] = {(uint64_t)_mpi_double, 8};
+ _mpi_type_lens[4] = {(uint64_t)_mpi_char, 1};
+#pragma omp target update to(_mpi_type_lens[0 : _MPI_DATATYPES])
+}
+// -------- END BLOCK TO DELETE WHEN MPI_Type_size on GPU WORKS ---------------
+
+/// Device stubs must use the identical host API interface.
+/// Stubs call _emissary_exec with additional args that include
+/// the identifier and additional D2H and H2D transfer vectors.
+/// whose params include an identifier
+///
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+
+extern "C" int MPI_Send(const void *buf, int count, MPI_Datatype datatype,
+ int dest, int tag, MPI_Comm comm) {
+ uint64_t dt_signature = (uint64_t)datatype;
+ int datatype_size = 8; // Default in case we do not have it in our table.
+ for (int i = 0; i < _MPI_DATATYPES; i++)
+ if (_mpi_type_lens[i].dt_signature == dt_signature) {
+ datatype_size = _mpi_type_lens[i].dt_size;
+ break;
+ }
+ return (int)_emissary_exec(
+ // The emissary identifier is a static 64 bit field that encodes
+ // the emissary id, emissary function, D2H Xfer cnt, and H2D Xfer cnt.
+ _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Send_idx, 1, 0),
+ // Each D2H transfer vector needs a pair of args to describe the xfer
+ // The first is the device pointer, the 2nd is size.
+ buf, (int)count * datatype_size,
+ // These are the actual 6 MPI_Send Args passed directly from the params
+ buf, count, datatype, dest, tag, comm);
+}
+extern "C" int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source,
+ int tag, MPI_Comm comm, MPI_Status *st) {
+ uint64_t dt_signature = (uint64_t)datatype;
+ int datatype_size = 8;
+ for (int i = 0; i < _MPI_DATATYPES; i++)
+ if (_mpi_type_lens[i].dt_signature == dt_signature) {
+ datatype_size = _mpi_type_lens[i].dt_size;
+ break;
+ }
+ return (int)_emissary_exec(_PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Recv_idx, 0, 1),
+ buf,
+ (int)count * datatype_size, // This is a H2D Xfer
+ buf, count, datatype, source, tag, comm, st);
+}
+extern "C" int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count,
+ MPI_Datatype datatype, MPI_Op op, MPI_Comm comm) {
+ return (int)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Allreduce_idx, 1, 1), sendbuf, recvbuf,
+ count, datatype, op, comm);
+}
+extern "C" int MPI_Reduce(const void *sendbuf, void *recvbuf, int count,
+ MPI_Datatype datatype, MPI_Op op, int root,
+ MPI_Comm comm) {
+ return (int)_emissary_exec(_PACK_EMIS_IDS(EMIS_ID_MPI, _MPI_Reduce_idx, 1, 1),
+ sendbuf, recvbuf, count, datatype, op, root, comm);
+}
+
+#else
+
+/// Host variadic wrapper functions.
+extern "C" {
+extern int V_MPI_Send(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ int v1 = va_arg(args, int);
+ MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+ int v3 = va_arg(args, int);
+ int v4 = va_arg(args, int);
+ MPI_Comm v5 = va_arg(args, MPI_Comm);
+ va_end(args);
+ int rval = MPI_Send(v0, v1, v2, v3, v4, v5);
+ return rval;
+}
+extern int V_MPI_Recv(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ int v1 = va_arg(args, int);
+ MPI_Datatype v2 = va_arg(args, MPI_Datatype);
+ int v3 = va_arg(args, int);
+ int v4 = va_arg(args, int);
+ MPI_Comm v5 = va_arg(args, MPI_Comm);
+ MPI_Status *v6 = va_arg(args, MPI_Status *);
+ va_end(args);
+ int rval = MPI_Recv(v0, v1, v2, v3, v4, v5, v6);
+ return rval;
+}
+extern int V_MPI_Allreduce(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *buf = va_arg(args, void *);
+ void *recvbuf = va_arg(args, void *);
+ int count = va_arg(args, int);
+ MPI_Datatype datatype = va_arg(args, MPI_Datatype);
+ MPI_Op op = va_arg(args, MPI_Op);
+ MPI_Comm comm = va_arg(args, MPI_Comm);
+ va_end(args);
+ int rval = MPI_Allreduce(buf, recvbuf, count, datatype, op, comm);
+ return rval;
+}
+extern int V_MPI_Reduce(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *buf = va_arg(args, void *);
+ void *recvbuf = va_arg(args, void *);
+ int count = va_arg(args, int);
+ MPI_Datatype datatype = va_arg(args, MPI_Datatype);
+ MPI_Op op = va_arg(args, MPI_Op);
+ int root = va_arg(args, int);
+ MPI_Comm comm = va_arg(args, MPI_Comm);
+ va_end(args);
+ int rval = MPI_Reduce(buf, recvbuf, count, datatype, op, root, comm);
+ return rval;
+}
+
+/// EmissaryMPI function selector
+EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab, emis_argptr_t *a[]) {
+
+ switch (ab->emisfnid) {
+ case _MPI_Send_idx: {
+ void *fnptr = (void *)V_MPI_Send;
+ int return_value_int =
+ V_MPI_Send(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+ return (EmissaryReturn_t)return_value_int;
+ }
+ case _MPI_Recv_idx: {
+ void *fnptr = (void *)V_MPI_Recv;
+ int return_value_int =
+ V_MPI_Recv(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+ return (EmissaryReturn_t)return_value_int;
+ }
+ case _MPI_Allreduce_idx: {
+ void *fnptr = (void *)V_MPI_Allreduce;
+ int return_value_int =
+ V_MPI_Allreduce(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+ return (EmissaryReturn_t)return_value_int;
+ }
+ case _MPI_Reduce_idx: {
+ void *fnptr = (void *)V_MPI_Reduce;
+ int return_value_int =
+ V_MPI_Reduce(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+ return (EmissaryReturn_t)return_value_int;
+ }
+ }
+ return (EmissaryReturn_t)0;
+}
+
+} // end extern "C"
+
+#endif
diff --git a/clang/lib/Headers/llvm_libc_wrappers/stdio.h b/clang/lib/Headers/llvm_libc_wrappers/stdio.h
index 0c3e44823da70..e8c0250cfb01b 100644
--- a/clang/lib/Headers/llvm_libc_wrappers/stdio.h
+++ b/clang/lib/Headers/llvm_libc_wrappers/stdio.h
@@ -21,6 +21,23 @@
#define __LIBC_ATTRS
#endif
+// To turn off emissary print (and this macro) set -fno-use-emissary-print.
+#ifdef OFFLOAD_ENABLE_EMISSARY_PRINT
+#if defined(__NVPTX__) || defined(__AMDGCN__)
+#include <EmissaryIds.h>
+#define fprintf(...) \
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _fprintf_idx, 0, 0), \
+ __VA_ARGS__);
+#define printf(...) \
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx, 0, 0), __VA_ARGS__);
+#define fputc(c, stream) fprintf(stream, "%c", (unsigned char)(c))
+#define putc(c, stream) fprintf(stream, "%c", (unsigned char)(c))
+#define putchar(c) printf("%c", (char)(c))
+#define fputs(str, stream) fprintf((stream), "%s", (str))
+#define puts(str) fprintf(stdout, "%s", (str))
+#endif
+#endif
+
// Some headers provide these as macros. Temporarily undefine them so they do
// not conflict with any definitions for the GPU.
diff --git a/offload/include/Shared/RPCOpcodes.h b/offload/include/Shared/RPCOpcodes.h
index beee29df1f707..fbdc839620cec 100644
--- a/offload/include/Shared/RPCOpcodes.h
+++ b/offload/include/Shared/RPCOpcodes.h
@@ -18,6 +18,8 @@
typedef enum {
OFFLOAD_HOST_CALL = LLVM_OFFLOAD_OPCODE(0),
+ OFFLOAD_EMISSARY = LLVM_OFFLOAD_OPCODE(1),
+ OFFLOAD_EMISSARY_DM = LLVM_OFFLOAD_OPCODE(2),
} offload_opcode_t;
#undef LLVM_OFFLOAD_OPCODE
diff --git a/offload/libomptarget/CMakeLists.txt b/offload/libomptarget/CMakeLists.txt
index 93e684e53bf17..d3247c09a50a8 100644
--- a/offload/libomptarget/CMakeLists.txt
+++ b/offload/libomptarget/CMakeLists.txt
@@ -57,6 +57,7 @@ target_compile_definitions(omptarget PRIVATE
foreach(plugin IN LISTS LIBOMPTARGET_PLUGINS_TO_BUILD)
target_link_libraries(omptarget PRIVATE omptarget.rtl.${plugin})
endforeach()
+target_link_libraries(omptarget PRIVATE flang_rt.runtime)
target_compile_options(omptarget PRIVATE ${offload_compile_flags})
target_link_options(omptarget PRIVATE ${offload_link_flags})
diff --git a/offload/plugins-nextgen/common/CMakeLists.txt b/offload/plugins-nextgen/common/CMakeLists.txt
index ea0910abf95d5..0232fe2d83f28 100644
--- a/offload/plugins-nextgen/common/CMakeLists.txt
+++ b/offload/plugins-nextgen/common/CMakeLists.txt
@@ -7,6 +7,15 @@ tablegen(OFFLOAD include/OffloadErrcodes.inc -gen-errcodes -I ${CMAKE_CURRENT_SO
tablegen(OFFLOAD include/OffloadInfo.inc -gen-info -I ${CMAKE_CURRENT_SOURCE_DIR}/../../liboffload/API)
add_public_tablegen_target(PluginErrcodes)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+ set(emissary_sources
+ src/Emissary.cpp
+ src/EmissaryFortrt.cpp
+ src/EmissaryPrint.cpp
+ )
+ set(emissary_headers ${CMAKE_CURRENT_SOURCE_DIR}/../../../openmp/device/include)
+endif()
+
# NOTE: Don't try to build `PluginInterface` using `add_llvm_library` because we
# don't want to export `PluginInterface` while `add_llvm_library` requires that.
add_library(PluginCommon OBJECT
@@ -16,6 +25,7 @@ add_library(PluginCommon OBJECT
src/RPC.cpp
src/OffloadError.cpp
src/Utils/ELF.cpp
+ ${emissary_sources}
)
add_dependencies(PluginCommon intrinsics_gen PluginErrcodes)
@@ -34,6 +44,10 @@ endif()
# Include the RPC server from the `libc` project.
include(FindLibcCommonUtils)
target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+ target_link_libraries(PluginCommon PRIVATE flang_rt.runtime
+ -L${CMAKE_BINARY_DIR}/../../lib -L${CMAKE_INSTALL_PREFIX}/lib)
+endif()
# Define the TARGET_NAME and DEBUG_PREFIX.
target_compile_definitions(PluginCommon PRIVATE
@@ -50,6 +64,7 @@ target_include_directories(PluginCommon PUBLIC
${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
${LIBOMPTARGET_BINARY_INCLUDE_DIR}
${LIBOMPTARGET_INCLUDE_DIR}
+ ${emissary_headers}
)
set_target_properties(PluginCommon PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/offload/plugins-nextgen/common/include/Emissary.h b/offload/plugins-nextgen/common/include/Emissary.h
new file mode 100644
index 0000000000000..317c6ea51136b
--- /dev/null
+++ b/offload/plugins-nextgen/common/include/Emissary.h
@@ -0,0 +1,263 @@
+//===-- offload/plugins-nextgen/common/include/Emissary.h ------ C++ ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines emissary helper functions. This include is only used for host
+// compilation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OFFLOAD_EMISSARY_H
+#define OFFLOAD_EMISSARY_H
+
+#include "EmissaryIds.h"
+#include "RPC.h"
+// #include "shared/rpc.h"
+#include "shared/rpc_server.h"
+
+extern "C" {
+
+/// Called by rpc after receiving emissary argument buffer
+EmissaryReturn_t Emissary(char *data, emisArgBuf_t *ab,
+ std::unordered_map<void *, void *> *D2HAddrList);
+
+/// Called by Emissary for all Fortrt emissary functions
+EmissaryReturn_t
+EmissaryFortrt(char *data, emisArgBuf_t *ab,
+ std::unordered_map<void *, void *> *D2HAddrList);
+
+/// Called by Emissary for all misc print functions
+EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab);
+
+/// Called by Emissary for all MPI emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryMPI(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+
+/// Called by Emissary for all HDF5 Emissary API functions
+__attribute((weak)) EmissaryReturn_t EmissaryHDF5(char *data, emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+
+/// Called by Emissary to support user-defined emissary API
+__attribute((weak)) EmissaryReturn_t EmissaryReserve(char *data,
+ emisArgBuf_t *ab,
+ emis_argptr_t *arg[]);
+
+/// emisExtractArgBuf is called within the "case OFFLOAD_EMISSARY:" stanza
+/// in offload/plugins-nextgen/common/src/RPC.cpp to build the emisArgBuf_t
+/// structure from the emissary data buffer sent to the CPU by rpc.
+/// This buffer is created by clang CodeGen when variadic function
+/// _emissary_exec(...) is encountered when compiling any emissary device
+/// stub to define the device function.
+void emisExtractArgBuf(char *buf, emisArgBuf_t *ab);
+
+/// Move the ArgBuf tracker past a set of XferSrgs
+void emisSkipXferArgSet(emisArgBuf_t *ab);
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+uint64_t getuint32(char *val);
+/// Get uint64_t value from a char ptr
+uint64_t getuint64(char *val);
+/// Get a function pointer from a char ptr
+void *getfnptr(char *val);
+
+/// Builds the array of pointers passed to V_ functions
+uint32_t EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr,
+ char *strptr, unsigned long long *data_not_used,
+ emis_argptr_t *a[],
+ std::unordered_map<void *, void *> *D2HAddrList);
+
+} // end extern "C"
+
+/// Call the associated V_ function
+template <typename T, typename FT>
+extern T EmissaryCallFnptr(uint32_t NumArgs, void *fnptr, emis_argptr_t *a[]);
+
+// We would like to get llvm typeID enum from Type.h. e.g.
+// #include ".../llvm/include/llvm/IR/Type.h"
+// But we cannot include LLVM headers in a runtime function.
+// So we a have a manual copy of llvm TypeID enum from Type.h
+// The codegen for _emissary_exec puts this ID in the key for
+// each arg and the host runtime needs to decode this key.
+enum TypeID {
+ // PrimitiveTypes
+ HalfTyID = 0, ///< 16-bit floating point type
+ BFloatTyID, ///< 16-bit floating point type (7-bit significand)
+ FloatTyID, ///< 32-bit floating point type
+ DoubleTyID, ///< 64-bit floating point type
+ X86_FP80TyID, ///< 80-bit floating point type (X87)
+ FP128TyID, ///< 128-bit floating point type (112-bit significand)
+ PPC_FP128TyID, ///< 128-bit floating point type (two 64-bits, PowerPC)
+ VoidTyID, ///< type with no size
+ LabelTyID, ///< Labels
+ MetadataTyID, ///< Metadata
+ X86_AMXTyID, ///< AMX vectors (8192 bits, X86 specific)
+ TokenTyID, ///< Tokens
+
+ // Derived types... see DerivedTypes.h file.
+ IntegerTyID, ///< Arbitrary bit width integers
+ FunctionTyID, ///< Functions
+ PointerTyID, ///< Pointers
+ StructTyID, ///< Structures
+ ArrayTyID, ///< Arrays
+ FixedVectorTyID, ///< Fixed width SIMD vector type
+ ScalableVectorTyID, ///< Scalable SIMD vector type
+ TypedPointerTyID, ///< Typed pointer used by some GPU targets
+ TargetExtTyID, ///< Target extension type
+};
+
+template <typename T, typename FT>
+extern T EmissaryCallFnptr(uint32_t NumArgs, void *fnptr, emis_argptr_t *a[]) {
+ T rv;
+ FT *vfnptr = (FT *)fnptr;
+ switch (NumArgs) {
+ case 1:
+ rv = (T)vfnptr(fnptr, a[0]);
+ break;
+ case 2:
+ rv = (T)vfnptr(fnptr, a[0], a[1]);
+ break;
+ case 3:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2]);
+ break;
+ case 4:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3]);
+ break;
+ case 5:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4]);
+ break;
+ case 6:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5]);
+ break;
+ case 7:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6]);
+ break;
+ case 8:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
+ break;
+ case 9:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8]);
+ break;
+ case 10:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9]);
+ break;
+ case 11:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10]);
+ break;
+ case 12:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11]);
+ break;
+ case 13:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12]);
+ break;
+ case 14:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13]);
+ break;
+ case 15:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14]);
+ break;
+ case 16:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15]);
+ break;
+ case 17:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16]);
+ break;
+ case 18:
+ rv =
+ (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17]);
+ break;
+ case 19:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18]);
+ break;
+ case 20:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19]);
+ break;
+ case 21:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20]);
+ break;
+ case 22:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21]);
+ break;
+ case 23:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22]);
+ break;
+ case 24:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23]);
+ break;
+ case 25:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24]);
+ break;
+ case 26:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25]);
+ break;
+ case 27:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+ a[26]);
+ break;
+ case 28:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+ a[26], a[27]);
+ break;
+ case 29:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+ a[26], a[27], a[28]);
+ break;
+ case 30:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+ a[26], a[27], a[28], a[29]);
+ break;
+ case 31:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+ a[26], a[27], a[28], a[29], a[30]);
+ break;
+ case 32:
+ rv = (T)vfnptr(fnptr, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+ a[9], a[10], a[11], a[12], a[13], a[14], a[15], a[16], a[17],
+ a[18], a[19], a[20], a[21], a[22], a[23], a[24], a[25],
+ a[26], a[27], a[28], a[29], a[30], a[31]);
+ break;
+ default:
+ rv = 0;
+ }
+ return rv;
+}
+
+#endif // OFFLOAD_EMISSARY_H
diff --git a/offload/plugins-nextgen/common/src/Emissary.cpp b/offload/plugins-nextgen/common/src/Emissary.cpp
new file mode 100644
index 0000000000000..0be0e426ee050
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/Emissary.cpp
@@ -0,0 +1,259 @@
+//===----- ioffload/plugins-nexgen/common/include/Emissary.cpp ---- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RPC.h"
+
+#include "Shared/Debug.h"
+#include "Shared/RPCOpcodes.h"
+
+#include "PluginInterface.h"
+
+#include "Emissary.h"
+#include "EmissaryIds.h"
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include <unordered_map>
+
+extern "C" EmissaryReturn_t
+Emissary(char *data, emisArgBuf_t *ab,
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ EmissaryReturn_t result = 0;
+ emis_argptr_t **args = (emis_argptr_t **)aligned_alloc(
+ sizeof(emis_argptr_t), ab->NumArgs * sizeof(emis_argptr_t *));
+
+ switch (ab->emisid) {
+ case EMIS_ID_INVALID: {
+ fprintf(stderr, "Emissary (host execution) got invalid EMIS_ID\n");
+ result = 0;
+ break;
+ }
+ case EMIS_ID_FORTRT: {
+ result = EmissaryFortrt(data, ab, D2HAddrList);
+ break;
+ }
+ case EMIS_ID_PRINT: {
+ result = EmissaryPrint(data, ab);
+ break;
+ }
+ case EMIS_ID_MPI: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryMPI(data, ab, args);
+ break;
+ }
+ case EMIS_ID_HDF5: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryHDF5(data, ab, args);
+ break;
+ }
+ case EMIS_ID_RESERVE: {
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &(ab->data_not_used), &args[0],
+ D2HAddrList) != _ERC_SUCCESS)
+ return (EmissaryReturn_t)0;
+ result = EmissaryReserve(data, ab, args);
+ break;
+ }
+ default:
+ fprintf(stderr,
+ "Emissary (host execution) EMIS_ID:%d fnid:%d not supported\n",
+ ab->emisid, ab->emisfnid);
+ }
+ free(args);
+ return result;
+}
+
+// emisExtractArgBuf reverses protocol that codegen in EmitEmissaryExec makes.
+extern "C" void emisExtractArgBuf(char *data, emisArgBuf_t *ab) {
+
+ uint32_t *int32_data = (uint32_t *)data;
+ ab->DataLen = int32_data[0];
+ ab->NumArgs = int32_data[1];
+
+ // Note: while the data buffer contains all args including strings,
+ // ab->DataLen does not include strings. It only counts header, keys,
+ // and aligned numerics.
+
+ ab->keyptr = data + (2 * sizeof(int));
+ ab->argptr = ab->keyptr + (ab->NumArgs * sizeof(int));
+ ab->strptr = data + (size_t)ab->DataLen;
+ int alignfill = 0;
+ if (((size_t)ab->argptr) % (size_t)8) {
+ ab->argptr += 4;
+ alignfill = 4;
+ }
+
+ // Extract the two emissary identifiers and number of send
+ // and recv device data transfers. These are 4 16 bit values
+ // packed into a single 64-bit field.
+ uint64_t arg1 = *(uint64_t *)ab->argptr;
+ ab->emisid = (unsigned int)((arg1 >> 48) & 0xFFFF);
+ ab->emisfnid = (unsigned int)((arg1 >> 32) & 0xFFFF);
+ ab->NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+ ab->NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+
+ // skip the uint64_t emissary id arg which is first arg in _emissary_exec.
+ ab->keyptr += sizeof(int);
+ ab->argptr += sizeof(uint64_t);
+ ab->NumArgs -= 1;
+
+ // data_not_used used for testing consistency.
+ ab->data_not_used =
+ (size_t)(ab->DataLen) - (((size_t)(3 + ab->NumArgs) * sizeof(int)) +
+ sizeof(uint64_t) + alignfill);
+
+ // Ensure first arg after emissary id arg is aligned.
+ if (((size_t)ab->argptr) % (size_t)8) {
+ ab->argptr += 4;
+ ab->data_not_used -= 4;
+ }
+}
+
+/// Get uint32 value extended to uint64_t value from a char ptr
+extern "C" uint64_t getuint32(char *val) {
+ uint32_t i32 = *(uint32_t *)val;
+ return (uint64_t)i32;
+}
+
+/// Get uint64_t value from a char ptr
+extern "C" uint64_t getuint64(char *val) { return *(uint64_t *)val; }
+
+/// Get a function pointer from a char ptr
+extern "C" void *getfnptr(char *val) {
+ uint64_t ival = *(uint64_t *)val;
+ return (void *)ival;
+}
+
+// build argument array
+extern "C" uint32_t
+EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
+ unsigned long long *data_not_used, emis_argptr_t *a[],
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ size_t num_bytes;
+ size_t bytes_consumed;
+ size_t strsz;
+ size_t fillerNeeded;
+
+ uint argcount = 0;
+
+ for (int argnum = 0; argnum < NumArgs; argnum++) {
+ num_bytes = 0;
+ strsz = 0;
+ unsigned int key = *(unsigned int *)keyptr;
+ unsigned int llvmID = key >> 16;
+ unsigned int numbits = (key << 16) >> 16;
+
+ switch (llvmID) {
+ case FloatTyID: ///< 2: 32-bit floating point type
+ case DoubleTyID: ///< 3: 64-bit floating point type
+ case FP128TyID: ///< 5: 128-bit floating point type (112-bit mantissa)
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+
+ if (num_bytes == 4)
+ a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+ else
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ break;
+
+ case IntegerTyID: ///< 11: Arbitrary bit width integers
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+
+ if (num_bytes == 4)
+ a[argcount] = (emis_argptr_t *)getuint32(dataptr);
+ else
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ // fprintf(stderr, "Arg:%d IntegerType bytes:%ld val:%ld\n",argcount,
+ // num_bytes,(uint64_t) a[argcount]);
+ break;
+
+ case PointerTyID: { ///< 15: Pointers
+ if (numbits == 1) { // This is a pointer to string
+ num_bytes = 4;
+ bytes_consumed = num_bytes;
+ strsz = (size_t) * (unsigned int *)dataptr;
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ a[argcount] = (emis_argptr_t *)((char *)strptr);
+ } else {
+ num_bytes = 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded; // dataptr is now aligned
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ a[argcount] = (emis_argptr_t *)getuint64(dataptr);
+ }
+ auto found = D2HAddrList->find((void *)a[argcount]);
+ if (found != D2HAddrList->end())
+ a[argcount] = (emis_argptr_t *)found->second;
+ // fprintf(stderr, "Arg:%d pointer Type bytes:%ld val:%p\n",argcount,
+ // num_bytes, (void*) a[argcount]);
+ } break;
+
+ case HalfTyID: ///< 1: 16-bit floating point type
+ case ArrayTyID: ///< 14: Arrays
+ case StructTyID: ///< 13: Structures
+ case FunctionTyID: ///< 12: Functions
+ case TokenTyID: ///< 10: Tokens
+ case MetadataTyID: ///< 8: Metadata
+ case LabelTyID: ///< 7: Labels
+ case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
+ ///< PowerPC)
+ case X86_FP80TyID: ///< 4: 80-bit floating point type (X87)
+ case FixedVectorTyID: ///< 16: Fixed width SIMD vector type
+ case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+ case TypedPointerTyID: ///< Typed pointer used by some GPU targets
+ case TargetExtTyID: ///< Target extension type
+ case VoidTyID:
+ return _ERC_UNSUPPORTED_ID_ERROR;
+ break;
+ default:
+ return _ERC_INVALID_ID_ERROR;
+ }
+
+ // Move to next argument
+ dataptr += num_bytes;
+ strptr += strsz;
+ *data_not_used -= bytes_consumed;
+ keyptr += 4;
+ argcount++;
+ }
+ return _ERC_SUCCESS;
+}
+
+extern "C" void emisSkipXferArgSet(emisArgBuf_t *ab) {
+ // Skip the ptr and size of the Xfer
+ ab->NumArgs -= 2;
+ ab->keyptr += 2 * sizeof(uint32_t);
+ ab->argptr += 2 * sizeof(void *);
+ ab->data_not_used -= 2 * sizeof(void *);
+}
diff --git a/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp b/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
new file mode 100644
index 0000000000000..7853357b1329a
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/EmissaryFortrt.cpp
@@ -0,0 +1,470 @@
+//===---- offload/plugins-nextgen/common/src/EmissaryFortrt.cpp ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Host support for Fortran runtime Emissary API
+//
+//===----------------------------------------------------------------------===//
+#include "PluginInterface.h"
+#include "RPC.h"
+#include "Shared/Debug.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+#include "shared/rpc_opcodes.h"
+#include <Emissary.h>
+#include <EmissaryIds.h>
+#include <assert.h>
+#include <cstring>
+#include <ctype.h>
+#include <list>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <tuple>
+#include <vector>
+
+// Headers for Host Fortran Runtime API as built in llvm/flang/runtime
+extern "C" {
+void *_FortranAioBeginExternalListOutput(uint32_t a1, const char *a2,
+ uint32_t a3);
+void *_FortranAioBeginExternalFormattedOutput(const char *ptr1, uint64_t x1,
+ void *ptr2, uint32_t x2,
+ const char *ptr3, uint32_t x3);
+bool _FortranAioOutputAscii(void *a1, char *a2, uint64_t a3);
+bool _FortranAioOutputInteger32(void *a1, uint32_t a2);
+uint32_t _FortranAioEndIoStatement(void *a1);
+bool _FortranAioOutputInteger8(void *cookie, int8_t n);
+bool _FortranAioOutputInteger16(void *cookie, int16_t n);
+bool _FortranAioOutputInteger64(void *cookie, int64_t n);
+bool _FortranAioOutputReal32(void *cookie, float x);
+bool _FortranAioOutputReal64(void *cookie, double x);
+bool _FortranAioOutputComplex32(void *cookie, float re, float im);
+bool _FortranAioOutputComplex64(void *cookie, double re, double im);
+bool _FortranAioOutputLogical(void *cookie, bool truth);
+void _FortranAAbort();
+void _FortranAStopStatementText(char *errmsg, int64_t a1, bool a2, bool a3);
+void _FortranAStopStatement(int32_t a1, bool a2, bool a3);
+
+// Save the cookie because deferred functions have execution reordered.
+static void *_list_started_cookie = nullptr;
+extern void *V_FortranAioBeginExternalListOutput(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ int32_t v0 = va_arg(args, int32_t);
+ const char *v1 = va_arg(args, const char *);
+ int32_t v2 = va_arg(args, int32_t);
+ va_end(args);
+ void *cookie = _FortranAioBeginExternalListOutput(v0, v1, v2);
+ _list_started_cookie = cookie;
+ return cookie;
+}
+extern void *V_FortranAioBeginExternalFormattedOutput(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ const char *p0 = va_arg(args, const char *);
+ int64_t v0 = va_arg(args, int64_t);
+ void *p1 = va_arg(args, void *);
+ int32_t v1 = va_arg(args, int32_t);
+ const char *p2 = va_arg(args, const char *);
+ int32_t v2 = va_arg(args, int32_t);
+ va_end(args);
+ void *cookie =
+ _FortranAioBeginExternalFormattedOutput(p0, v0, p1, v1, p2, v2);
+ _list_started_cookie = cookie;
+ return cookie;
+}
+extern bool V_FortranAioOutputAscii(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ char *v1 = va_arg(args, char *);
+ uint64_t v2 = va_arg(args, uint64_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ return _FortranAioOutputAscii(v0, v1, v2);
+}
+extern bool V_FortranAioOutputInteger32(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint32_t v1 = va_arg(args, uint32_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ return _FortranAioOutputInteger32(v0, v1);
+}
+extern uint32_t V_FortranAioEndIoStatement(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ va_end(args);
+ v0 = _list_started_cookie;
+ uint32_t rv = _FortranAioEndIoStatement(v0);
+ return rv;
+}
+extern bool V_FortranAioOutputInteger8(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint32_t v1 = va_arg(args, uint32_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ return _FortranAioOutputInteger8(v0, v1);
+}
+extern bool V_FortranAioOutputInteger16(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint32_t v1 = va_arg(args, uint32_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ return _FortranAioOutputInteger16(v0, v1);
+}
+extern bool V_FortranAioOutputInteger64(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint32_t v1 = va_arg(args, uint32_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ return _FortranAioOutputInteger64(v0, v1);
+}
+extern bool V_FortranAioOutputReal32(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint64_t v1 = va_arg(args, uint64_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ double dv;
+ memcpy(&dv, &v1, 8);
+ return _FortranAioOutputReal32(v0, (float)dv);
+}
+extern bool V_FortranAioOutputReal64(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *cookie = va_arg(args, void *);
+ uint64_t v1 = va_arg(args, uint64_t);
+ va_end(args);
+ cookie = _list_started_cookie;
+ double dv;
+ memcpy(&dv, &v1, 8);
+ return _FortranAioOutputReal64(cookie, dv);
+}
+extern bool V_FortranAioOutputComplex32(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint64_t v1 = va_arg(args, uint64_t);
+ uint64_t v2 = va_arg(args, uint64_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ double dv1, dv2;
+ memcpy(&dv1, &v1, 8);
+ memcpy(&dv2, &v2, 8);
+ return _FortranAioOutputComplex32(v0, (float)dv1, (float)dv2);
+}
+extern bool V_FortranAioOutputComplex64(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint64_t v1 = va_arg(args, uint64_t);
+ uint64_t v2 = va_arg(args, uint64_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ double dv1, dv2;
+ memcpy(&dv1, &v1, 8);
+ memcpy(&dv2, &v2, 8);
+ return _FortranAioOutputComplex64(v0, dv1, dv2);
+}
+extern bool V_FortranAioOutputLogical(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ void *v0 = va_arg(args, void *);
+ uint32_t v1 = va_arg(args, uint32_t);
+ va_end(args);
+ v0 = _list_started_cookie;
+ return _FortranAioOutputLogical(v0, v1);
+}
+extern void V_FortranAAbort(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ va_end(args);
+ _FortranAAbort();
+ // Now return to device to run abort from stub
+}
+extern void V_FortranAStopStatementText(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ char *errmsg = va_arg(args, char *);
+ int64_t a1 = va_arg(args, int64_t);
+ uint32_t a2 = va_arg(args, uint32_t);
+ uint32_t a3 = va_arg(args, uint32_t);
+ va_end(args);
+ bool b2 = (bool)a2;
+ bool b3 = (bool)a3;
+ _FortranAStopStatementText(errmsg, a1, b2, b3);
+}
+extern void V_FortranAStopStatement(void *fnptr, ...) {
+ va_list args;
+ va_start(args, fnptr);
+ int32_t a1 = va_arg(args, int32_t);
+ uint32_t a2 = va_arg(args, uint32_t);
+ uint32_t a3 = va_arg(args, uint32_t);
+ va_end(args);
+ bool b2 = (bool)a2;
+ bool b3 = (bool)a3;
+ _FortranAStopStatement(a1, b2, b3);
+}
+} // end extern "C"
+
+// Static vars used to defer functions to reorder execution by thread and team.
+static uint32_t _deferred_fn_count = 0;
+static uint32_t _deferred_begin_statements = 0;
+static uint32_t _deferred_end_statements = 0;
+static uint64_t _max_num_threads = 0;
+static uint64_t _max_num_teams = 0;
+
+// structure for deferred functions
+typedef struct {
+ uint32_t NumArgs; // The number of args in arg_array
+ void *fnptr; // The function pointer for this index
+ uint64_t fn_idx; // The function index, good for debug
+ uint32_t dfnid; // The dvoideferred function id, in order received
+ uint64_t *arg_array; // ptr to malloced arg_array
+ char *c_ptr; // ptr to null terminated char string
+ char *c_ptr2; // ptr to null terminated char string
+ uint64_t thread_num;
+ uint64_t num_threads;
+ uint64_t team_num;
+ uint64_t num_teams;
+ EmissaryReturn_t return_value;
+} deferred_entry_t;
+
+static std::vector<deferred_entry_t *> *_deferred_fns_ptr;
+// static std::list<deferred_entry_t *> _deferred_fns;
+//
+
+extern "C" EmissaryReturn_t
+EmissaryFortrt(char *data, emisArgBuf_t *ab,
+ std::unordered_map<void *, void *> *D2HAddrList) {
+ EmissaryReturn_t return_value = (EmissaryReturn_t)0;
+
+ if (ab->DataLen == 0)
+ return _ERC_SUCCESS;
+
+ void *fnptr;
+ if (ab->NumArgs <= 0)
+ return _ERC_ERROR_INVALID_REQUEST;
+
+ emis_argptr_t *a[MAXVARGS];
+ if (EmissaryBuildVargs(ab->NumArgs, ab->keyptr, ab->argptr, ab->strptr,
+ &ab->data_not_used, a, D2HAddrList) != _ERC_SUCCESS)
+ return _ERC_ERROR_INVALID_REQUEST;
+
+ // std::list<deferred_entry_t *> _deferred_fns;
+ if (!_deferred_fns_ptr)
+ _deferred_fns_ptr = new std::vector<deferred_entry_t *>;
+
+ char *c_ptr = nullptr;
+ char *c_ptr2 = nullptr;
+ bool defer_for_reorder = true;
+ bool run_deferred_functions = false;
+ switch (ab->emisfnid) {
+ case _FortranAioBeginExternalListOutput_idx: {
+ _deferred_begin_statements++;
+ fnptr = (void *)V_FortranAioBeginExternalListOutput;
+ size_t slen = std::strlen((char *)a[5]) + 1;
+ c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+ if (!c_ptr)
+ fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+ std::strncpy(c_ptr, (char *)a[5], slen - 1);
+ c_ptr[slen - 1] = (char)0;
+ a[5] = (emis_argptr_t *)c_ptr;
+ break;
+ }
+ case _FortranAioBeginExternalFormattedOutput_idx: {
+ _deferred_begin_statements++;
+ fnptr = (void *)V_FortranAioBeginExternalFormattedOutput;
+ size_t slen = std::strlen((char *)a[8]) + 1;
+ c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+ if (!c_ptr)
+ fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+ std::strncpy(c_ptr, (char *)a[8], slen - 1);
+ c_ptr[slen - 1] = (char)0;
+ a[8] = (emis_argptr_t *)c_ptr;
+
+ slen = std::strlen((char *)a[4]) + 1;
+ c_ptr2 = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+ if (!c_ptr2)
+ fprintf(stderr, "MALLOC FAILED for c_ptr2 size:%ld \n", slen);
+ std::strncpy(c_ptr2, (char *)a[4], slen - 1);
+ c_ptr2[slen - 1] = (char)0;
+ a[4] = (emis_argptr_t *)c_ptr2;
+ break;
+ }
+ case _FortranAioOutputAscii_idx: {
+ fnptr = (void *)V_FortranAioOutputAscii;
+
+ size_t slen = (size_t)a[6] + 1;
+ c_ptr = (char *)aligned_alloc(sizeof(uint64_t *), slen);
+ if (!c_ptr)
+ fprintf(stderr, "MALLOC FAILED for c_ptr size:%ld \n", slen);
+ std::strncpy(c_ptr, (char *)a[5], slen - 1);
+ c_ptr[slen - 1] = (char)0;
+ a[5] = (emis_argptr_t *)c_ptr;
+
+ break;
+ }
+ case _FortranAioOutputInteger32_idx: {
+ fnptr = (void *)V_FortranAioOutputInteger32;
+ break;
+ }
+ case _FortranAioEndIoStatement_idx: {
+ _deferred_end_statements++;
+ fnptr = (void *)V_FortranAioEndIoStatement;
+ // We cannot use last tread and team number to trigger running deferred
+ // functions because its warp could finish early (out of order). So, if
+ // this is the last FortranAioEndIoStatement by count of begin statements,
+ // then run the deferred functions ordered by team and thread number.
+ if (_deferred_end_statements == _deferred_begin_statements)
+ run_deferred_functions = true;
+ break;
+ }
+ case _FortranAioOutputInteger8_idx: {
+ fnptr = (void *)V_FortranAioOutputInteger8;
+ break;
+ }
+ case _FortranAioOutputInteger16_idx: {
+ fnptr = (void *)V_FortranAioOutputInteger16;
+ break;
+ }
+ case _FortranAioOutputInteger64_idx: {
+ fnptr = (void *)V_FortranAioOutputInteger64;
+ break;
+ }
+ case _FortranAioOutputReal32_idx: {
+ fnptr = (void *)V_FortranAioOutputReal32;
+ break;
+ }
+ case _FortranAioOutputReal64_idx: {
+ fnptr = (void *)V_FortranAioOutputReal64;
+ break;
+ }
+ case _FortranAioOutputComplex32_idx: {
+ fnptr = (void *)V_FortranAioOutputComplex32;
+ break;
+ }
+ case _FortranAioOutputComplex64_idx: {
+ fnptr = (void *)V_FortranAioOutputComplex64;
+ break;
+ }
+ case _FortranAioOutputLogical_idx: {
+ fnptr = (void *)V_FortranAioOutputLogical;
+ break;
+ }
+ case _FortranAAbort_idx: {
+ defer_for_reorder = false;
+ fnptr = (void *)V_FortranAAbort;
+ break;
+ }
+ case _FortranAStopStatementText_idx: {
+ defer_for_reorder = false;
+ fnptr = (void *)V_FortranAStopStatementText;
+ break;
+ }
+ case _FortranAStopStatement_idx: {
+ defer_for_reorder = false;
+ fnptr = (void *)V_FortranAStopStatement;
+ break;
+ }
+ case _FortranAio_INVALID:
+ default: {
+ defer_for_reorder = false;
+ break;
+ }
+ } // end of switch
+
+ if (defer_for_reorder) {
+ _deferred_fn_count++;
+ deferred_entry_t *q = new deferred_entry_t;
+
+ q->dfnid = _deferred_fn_count - 1;
+ q->thread_num = (uint64_t)a[0];
+ q->num_threads = (uint64_t)a[1];
+ _max_num_threads =
+ (q->num_threads > _max_num_threads) ? q->num_threads : _max_num_threads;
+ q->team_num = (uint64_t)a[2];
+ q->num_teams = (uint64_t)a[3];
+ _max_num_teams =
+ (q->num_teams > _max_num_teams) ? q->num_teams : _max_num_teams;
+ q->NumArgs = ab->NumArgs - 4;
+ q->fnptr = fnptr;
+ q->fn_idx = ab->emisfnid;
+ uint64_t *arg_array = (uint64_t *)aligned_alloc(
+ sizeof(uint64_t), (ab->NumArgs - 4) * sizeof(uint64_t));
+ if (!arg_array)
+ fprintf(stderr, " MALLOC FAILED for arg_array size:%ld \n",
+ sizeof(uint64_t) * (ab->NumArgs - 4));
+ for (uint32_t i = 0; i < ab->NumArgs - 4; i++) {
+ uint64_t val = (uint64_t)a[i + 4];
+ arg_array[i] = val;
+ }
+ q->arg_array = arg_array;
+ q->return_value = (EmissaryReturn_t)0;
+ q->c_ptr = c_ptr;
+ q->c_ptr2 = c_ptr2;
+ _deferred_fns_ptr->push_back(q);
+ } else {
+ // execute a non deferred function
+ return_value = EmissaryCallFnptr<EmissaryReturn_t, emisfn_t>(
+ ab->NumArgs - 4, fnptr, &a[4]);
+ }
+
+ if (run_deferred_functions) {
+ // This specific team and thread ordering does not reflect the
+ // actual non-deterministic ordering.
+ for (uint32_t team_num = 0; team_num < _max_num_teams; team_num++) {
+ for (uint32_t thread_num = 0; thread_num < _max_num_threads;
+ thread_num++) {
+ for (auto q : *_deferred_fns_ptr) {
+ if ((thread_num == q->thread_num) && (team_num == q->team_num)) {
+ for (uint32_t i = 0; i < q->NumArgs; i++)
+ a[i] = (emis_argptr_t *)q->arg_array[i];
+ q->return_value = EmissaryCallFnptr<EmissaryReturn_t, emisfn_t>(
+ q->NumArgs, q->fnptr, a);
+ }
+ // Only the return value for the last end statement is returned.
+ return_value = q->return_value;
+ }
+ }
+ }
+
+ // Reset static deferred function counters and free memory
+ for (auto q : *_deferred_fns_ptr) {
+ if (q->c_ptr)
+ free(q->c_ptr);
+ if (q->c_ptr2)
+ free(q->c_ptr2);
+ free(q->arg_array);
+ delete q;
+ }
+ _deferred_fns_ptr->clear();
+ _deferred_fn_count = 0;
+ _deferred_begin_statements = 0;
+ _deferred_end_statements = 0;
+ _max_num_threads = 0;
+ _max_num_teams = 0;
+ delete _deferred_fns_ptr;
+ _deferred_fns_ptr = nullptr;
+ } // end run_deferred_functions
+
+ return return_value;
+} // end EmissaryFortrt
diff --git a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
new file mode 100644
index 0000000000000..80c55d3c5a5fd
--- /dev/null
+++ b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
@@ -0,0 +1,423 @@
+//===--- offload/plugins-nextgen/common/src/EmissaryPrint.cpp ----- C++ ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Host support for misc emissary API.
+//
+//===----------------------------------------------------------------------===//
+#include <Emissary.h>
+#include <EmissaryIds.h>
+#include <assert.h>
+#include <cstring>
+#include <ctype.h>
+#include <list>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab);
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab);
+
+extern "C" EmissaryReturn_t EmissaryPrint(char *data, emisArgBuf_t *ab) {
+ uint32_t return_value;
+ service_rc rc;
+ switch (ab->emisfnid) {
+ case _printf_idx: {
+ rc = emissary_printf(&return_value, ab);
+ break;
+ }
+ case _fprintf_idx: {
+ rc = emissary_fprintf(&return_value, ab);
+ break;
+ }
+ case _ockl_asan_report_idx: {
+ fprintf(stderr, " asan_report not yet implemented\n");
+ return_value = 0;
+ rc = _ERC_STATUS_ERROR;
+ break;
+ }
+ case _print_INVALID:
+ default: {
+ fprintf(stderr, " INVALID emissary function id (%d) for PRINT API \n",
+ ab->emisfnid);
+ return_value = 0;
+ rc = _ERC_STATUS_ERROR;
+ break;
+ }
+ }
+ if (rc != _ERC_SUCCESS)
+ fprintf(stderr, "HOST failure in _emissary_execute_print rc:%d\n", rc);
+
+ return (EmissaryReturn_t)return_value;
+}
+
+// NUMFPREGS and FPREGSZ are part of x86 vargs ABI that
+// is recreated with the printf support.
+#define NUMFPREGS 8
+#define FPREGSZ 16
+
+typedef int uint128_t __attribute__((mode(TI)));
+struct emissary_pfIntRegs {
+ uint64_t rdi, rsi, rdx, rcx, r8, r9;
+};
+typedef struct emissary_pfIntRegs emissary_pfIntRegs_t; // size = 48 bytes
+
+struct emissary_pfRegSaveArea {
+ emissary_pfIntRegs_t iregs;
+ uint128_t freg[NUMFPREGS];
+};
+typedef struct emissary_pfRegSaveArea
+ emissary_pfRegSaveArea_t; // size = 304 bytes
+
+struct emissary_ValistExt {
+ uint32_t gp_offset; /* offset to next available gpr in reg_save_area */
+ uint32_t fp_offset; /* offset to next available fpr in reg_save_area */
+ void *overflow_arg_area; /* args that are passed on the stack */
+ emissary_pfRegSaveArea_t *reg_save_area; /* int and fp registers */
+ size_t overflow_size;
+} __attribute__((packed));
+typedef struct emissary_ValistExt emissary_ValistExt_t;
+
+// Handle overflow when building the va_list for vprintf
+static service_rc emissary_pfGetOverflow(emissary_ValistExt_t *valist,
+ size_t needsize) {
+ if (needsize < valist->overflow_size)
+ return _ERC_SUCCESS;
+
+ // Make the overflow area bigger
+ size_t stacksize;
+ void *newstack;
+ if (valist->overflow_size == 0) {
+ // Make initial save area big to reduce mallocs
+ stacksize = (FPREGSZ * NUMFPREGS) * 2;
+ if (needsize > stacksize)
+ stacksize = needsize; // maybe a big string
+ } else {
+ // Initial save area not big enough, double it
+ stacksize = valist->overflow_size * 2;
+ }
+ if (!(newstack = malloc(stacksize))) {
+ return _ERC_STATUS_ERROR;
+ }
+ memset(newstack, 0, stacksize);
+ if (valist->overflow_size) {
+ memcpy(newstack, valist->overflow_arg_area, valist->overflow_size);
+ free(valist->overflow_arg_area);
+ }
+ valist->overflow_arg_area = newstack;
+ valist->overflow_size = stacksize;
+ return _ERC_SUCCESS;
+}
+
+// Add an integer to the va_list for vprintf
+static service_rc emissary_pfAddInteger(emissary_ValistExt_t *valist, char *val,
+ size_t valsize, size_t *stacksize) {
+ uint64_t ival;
+ switch (valsize) {
+ case 1:
+ ival = *(uint8_t *)val;
+ break;
+ case 2:
+ ival = *(uint32_t *)val;
+ break;
+ case 4:
+ ival = (*(uint32_t *)val);
+ break;
+ case 8:
+ ival = *(uint64_t *)val;
+ break;
+ default: {
+ return _ERC_STATUS_ERROR;
+ }
+ }
+ // Always copy 8 bytes, sizeof(ival)
+ if ((valist->gp_offset + sizeof(ival)) <= sizeof(emissary_pfIntRegs_t)) {
+ memcpy(((char *)valist->reg_save_area + valist->gp_offset), &ival,
+ sizeof(ival));
+ valist->gp_offset += sizeof(ival);
+ return _ERC_SUCCESS;
+ }
+ // Ensure valist overflow area is big enough
+ size_t needsize = (size_t)*stacksize + sizeof(ival);
+ if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+ return _ERC_STATUS_ERROR;
+ // Copy to overflow
+ memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, &ival,
+ sizeof(ival));
+
+ *stacksize += sizeof(ival);
+ return _ERC_SUCCESS;
+}
+
+// Add a String argument when building va_list for vprintf
+static service_rc emissary_pfAddString(emissary_ValistExt_t *valist, char *val,
+ size_t strsz, size_t *stacksize) {
+ size_t valsize =
+ sizeof(char *); // ABI captures pointer to string, not string
+ if ((valist->gp_offset + valsize) <= sizeof(emissary_pfIntRegs_t)) {
+ memcpy(((char *)valist->reg_save_area + valist->gp_offset), val, valsize);
+ valist->gp_offset += valsize;
+ return _ERC_SUCCESS;
+ }
+ size_t needsize = (size_t)*stacksize + valsize;
+ if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+ return _ERC_STATUS_ERROR;
+ memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, val,
+ valsize);
+ *stacksize += valsize;
+ return _ERC_SUCCESS;
+}
+
+// Add a floating point value when building va_list for vprintf
+static service_rc emissary_pfAddFloat(emissary_ValistExt_t *valist,
+ char *numdata, size_t valsize,
+ size_t *stacksize) {
+ // we could use load because doubles are now aligned
+ double dval;
+ if (valsize == 4) {
+ float fval;
+ memcpy(&fval, numdata, 4);
+ dval = (double)fval; // Extend single to double per abi
+ } else if (valsize == 8) {
+ memcpy(&dval, numdata, 8);
+ } else {
+ return _ERC_STATUS_ERROR;
+ }
+ if ((valist->fp_offset + FPREGSZ) <= sizeof(emissary_pfRegSaveArea_t)) {
+ memcpy(((char *)valist->reg_save_area + (size_t)(valist->fp_offset)), &dval,
+ sizeof(double));
+ valist->fp_offset += FPREGSZ;
+ return _ERC_SUCCESS;
+ }
+ size_t needsize = (size_t)*stacksize + sizeof(double);
+ if (emissary_pfGetOverflow(valist, needsize) != _ERC_SUCCESS)
+ return _ERC_STATUS_ERROR;
+ memcpy((char *)(valist->overflow_arg_area) + (size_t)*stacksize, &dval,
+ sizeof(double));
+ // move only by the size of the double (8 bytes)
+ *stacksize += sizeof(double);
+ return _ERC_SUCCESS;
+}
+
+// Build an extended va_list for vprintf by unpacking the buffer
+static service_rc emissary_pfBuildValist(emissary_ValistExt_t *valist,
+ int NumArgs, char *keyptr,
+ char *dataptr, char *strptr,
+ unsigned long long *data_not_used) {
+ emissary_pfRegSaveArea_t *regs;
+ size_t regs_size = sizeof(*regs);
+ regs = (emissary_pfRegSaveArea_t *)malloc(regs_size);
+ if (!regs)
+ return _ERC_STATUS_ERROR;
+ memset(regs, 0, regs_size);
+ *valist = (emissary_ValistExt_t){
+ .gp_offset = 0,
+ .fp_offset = 0,
+ .overflow_arg_area = NULL,
+ .reg_save_area = regs,
+ .overflow_size = 0,
+ };
+
+ size_t num_bytes;
+ size_t bytes_consumed;
+ size_t strsz;
+ size_t fillerNeeded;
+
+ size_t stacksize = 0;
+
+ for (int argnum = 0; argnum < NumArgs; argnum++) {
+ num_bytes = 0;
+ strsz = 0;
+ unsigned int key = *(unsigned int *)keyptr;
+ unsigned int llvmID = key >> 16;
+ unsigned int numbits = (key << 16) >> 16;
+ switch (llvmID) {
+ case FloatTyID: ///< 2: 32-bit floating point type
+ case DoubleTyID: ///< 3: 64-bit floating point type
+ case FP128TyID: ///< 5: 128-bit floating point type (112-bit mantissa)
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ if (valist->fp_offset == 0)
+ valist->fp_offset = sizeof(emissary_pfIntRegs_t);
+ if (emissary_pfAddFloat(valist, dataptr, num_bytes, &stacksize))
+ return _ERC_ADDFLOAT_ERROR;
+ break;
+
+ case IntegerTyID: ///< 11: Arbitrary bit width integers
+ num_bytes = numbits / 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded;
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ if (emissary_pfAddInteger(valist, dataptr, num_bytes, &stacksize))
+ return _ERC_ADDINT_ERROR;
+ break;
+
+ case PointerTyID: ///< 15: Pointers
+ if (numbits == 1) { // This is a pointer to string
+ num_bytes = 4;
+ bytes_consumed = num_bytes;
+ strsz = (size_t) * (unsigned int *)dataptr;
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ if (emissary_pfAddString(valist, (char *)&strptr, strsz, &stacksize))
+ return _ERC_ADDSTRING_ERROR;
+ } else {
+ num_bytes = 8;
+ bytes_consumed = num_bytes;
+ fillerNeeded = ((size_t)dataptr) % num_bytes;
+ if (fillerNeeded) {
+ dataptr += fillerNeeded; // dataptr is now aligned
+ bytes_consumed += fillerNeeded;
+ }
+ if ((*data_not_used) < bytes_consumed)
+ return _ERC_DATA_USED_ERROR;
+ if (emissary_pfAddInteger(valist, dataptr, num_bytes, &stacksize))
+ return _ERC_ADDINT_ERROR;
+ }
+ break;
+
+ case HalfTyID: ///< 1: 16-bit floating point type
+ case ArrayTyID: ///< 14: Arrays
+ case StructTyID: ///< 13: Structures
+ case FunctionTyID: ///< 12: Functions
+ case TokenTyID: ///< 10: Tokens
+ case MetadataTyID: ///< 8: Metadata
+ case LabelTyID: ///< 7: Labels
+ case PPC_FP128TyID: ///< 6: 128-bit floating point type (two 64-bits,
+ ///< PowerPC)
+ case X86_FP80TyID: ///< 4: 80-bit floating point type (X87)
+ case FixedVectorTyID: ///< 16: Fixed width SIMD vector type
+ case ScalableVectorTyID: ///< 17: Scalable SIMD vector type
+ case TypedPointerTyID: ///< Typed pointer used by some GPU targets
+ case TargetExtTyID: ///< Target extension type
+ case VoidTyID:
+ return _ERC_UNSUPPORTED_ID_ERROR;
+ break;
+ default:
+ return _ERC_INVALID_ID_ERROR;
+ }
+
+ dataptr += num_bytes;
+ strptr += strsz;
+ *data_not_used -= bytes_consumed;
+ keyptr += 4;
+ }
+ return _ERC_SUCCESS;
+} // end emissary_pfBuildValist
+
+/*
+ * The buffer to pack arguments for all vargs functions has thes 4 sections:
+ * 1. Header datalen 4 bytes
+ * numargs 4 bytes
+ * 2. keyptr A 4-byte key for each arg including string args
+ * Each 4-byte key contains llvmID and numbits to
+ * describe the datatype.
+ * 3. argptr Ths data values for each argument.
+ * Each arg is aligned according to its size.
+ * If the field is a string
+ * the dataptr contains the string length.
+ * 4. strptr Exection time string values
+ */
+static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
+
+ if (ab->DataLen == 0)
+ return _ERC_SUCCESS;
+
+ char *fmtstr = ab->strptr;
+ FILE *fileptr = (FILE *)*((size_t *)ab->argptr);
+
+ // Skip past the file pointer
+ ab->NumArgs--;
+ ab->keyptr += 4;
+ ab->argptr += sizeof(FILE *);
+ ab->data_not_used -= sizeof(FILE *);
+
+ // Skip past the format string
+ ab->NumArgs--;
+ ab->keyptr += 4;
+ size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+ ab->strptr += abstrsz;
+ ab->argptr += 4;
+ ab->data_not_used -= 4;
+
+ emissary_ValistExt_t valist;
+ va_list *real_va_list;
+ real_va_list = (va_list *)&valist;
+
+ if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+ ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+ return _ERC_ERROR_INVALID_REQUEST;
+
+ // Roll back offsets and save stack pointer
+ valist.gp_offset = 0;
+ valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+ void *save_stack = valist.overflow_arg_area;
+ *rc = vfprintf(fileptr, fmtstr, *real_va_list);
+ if (valist.reg_save_area)
+ free(valist.reg_save_area);
+ if (save_stack)
+ free(save_stack);
+ return _ERC_SUCCESS;
+}
+
+static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab) {
+ if (ab->DataLen == 0)
+ return _ERC_SUCCESS;
+
+ char *fmtstr = ab->strptr;
+
+ // Skip past the format string
+ ab->NumArgs--;
+ ab->keyptr += 4;
+ size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+ ab->strptr += abstrsz;
+ ab->argptr += 4;
+ ab->data_not_used -= 4;
+
+ emissary_ValistExt_t valist;
+ va_list *real_va_list;
+ real_va_list = (va_list *)&valist;
+
+ if (emissary_pfBuildValist(&valist, ab->NumArgs, ab->keyptr, ab->argptr,
+ ab->strptr, &ab->data_not_used) != _ERC_SUCCESS)
+ return _ERC_ERROR_INVALID_REQUEST;
+
+ // Roll back offsets and save stack pointer for
+ valist.gp_offset = 0;
+ valist.fp_offset = sizeof(emissary_pfIntRegs_t);
+ void *save_stack = valist.overflow_arg_area;
+ *rc = vprintf(fmtstr, *real_va_list);
+ if (valist.reg_save_area)
+ free(valist.reg_save_area);
+ if (save_stack)
+ free(save_stack);
+ return _ERC_SUCCESS;
+}
+
+extern "C" void *global_allocate(uint32_t bufsz) {
+ return malloc((size_t)bufsz);
+}
+extern "C" int global_free(void *ptr) {
+ free(ptr);
+ return 0;
+}
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 8d6704733970d..c3f041aa26637 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -13,9 +13,11 @@
#include "PluginInterface.h"
+#include "Emissary.h"
#include "shared/rpc.h"
#include "shared/rpc_opcodes.h"
#include "shared/rpc_server.h"
+#include <unordered_map>
using namespace llvm;
using namespace omp;
@@ -63,6 +65,156 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
});
break;
}
+
+ // This case handles the device function __llvm_emissary_rpc for emissary
+ // APIs that require no d2h or h2d memory transfer.
+ case OFFLOAD_EMISSARY: {
+ uint64_t Sizes[NumLanes] = {0};
+ unsigned long long Results[NumLanes] = {0};
+ void *buf_ptrs[NumLanes] = {nullptr};
+ Port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+ uint32_t id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t ab;
+ emisExtractArgBuf((char *)buffer_ptr, &ab);
+ Results[id++] = Emissary((char *)buffer_ptr, &ab, nullptr);
+ }
+ }
+ Port.send([&](rpc::Buffer *Buffer, uint32_t ID) {
+ Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+ delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+ });
+ break;
+ }
+
+ // This case handles the device function __llvm_emissary_rpc_dm for emissary
+ // APIs require D2H or H2D transfer vectors to be processed through the Port.
+ // FIXME: test with multiple transfer vectors of the same type.
+ case OFFLOAD_EMISSARY_DM: {
+ uint64_t Sizes[NumLanes] = {0};
+ unsigned long long Results[NumLanes] = {0};
+ void *buf_ptrs[NumLanes] = {nullptr};
+ Port.recv_n(buf_ptrs, Sizes, [&](uint64_t Size) { return new char[Size]; });
+
+ uint32_t id = 0;
+ emisArgBuf_t AB[NumLanes];
+ std::unordered_map<void *, void *> D2HAddrList;
+ void *Xfers[NumLanes] = {nullptr};
+ void *devXfers[NumLanes] = {nullptr};
+ uint64_t XferSzs[NumLanes] = {0};
+ uint32_t numSendXfers = 0;
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+ numSendXfers++;
+ devXfers[id] = (void *)*((uint64_t *)ab->argptr);
+ XferSzs[id] = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+ emisSkipXferArgSet(ab);
+ }
+ // Allocate the host space for the receive Xfers
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ size_t devSz = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+ void *hostAddr = new char[devSz];
+ D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+ // recv_n for device send_n into new host-allocated Xfers
+ if (numSendXfers)
+ Port.recv_n(Xfers, XferSzs,
+ [&](uint64_t Size) { return new char[Size]; });
+
+ // Xfers now contains just allocated host addrs for sends and
+ // devXfers contains corresponding devAddr for those sends
+ // Build map to pass to Emissary
+ id = 0;
+ for (void *Xfer : Xfers) {
+ if (Xfer) {
+ D2HAddrList.insert(std::pair<void *, void *>(devXfers[id], Xfer));
+ id++;
+ }
+ }
+
+ // Call Emissary for each active lane
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+ emisSkipXferArgSet(ab);
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++)
+ emisSkipXferArgSet(ab);
+ Results[id] = Emissary((char *)buffer_ptr, ab, &D2HAddrList);
+ id++;
+ }
+ }
+
+ // Process send_n for the H2D Xfers.
+ void *recvXfers[NumLanes] = {nullptr};
+ uint64_t recvXferSzs[NumLanes] = {0};
+ id = 0;
+ uint32_t numRecvXfers = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ // Reset ArgBuf tracker
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++)
+ emisSkipXferArgSet(ab);
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ numRecvXfers++;
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ recvXfers[id] = D2HAddrList[devAddr];
+ recvXferSzs[id] =
+ (uint64_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+ if (numRecvXfers)
+ Port.send_n(recvXfers, recvXferSzs);
+ // Cleanup all host allocated transfer buffers
+ id = 0;
+ for (void *buffer_ptr : buf_ptrs) {
+ if (buffer_ptr) {
+ emisArgBuf_t *ab = &AB[id];
+ // Reset the ArgBuf tracker ab
+ emisExtractArgBuf((char *)buffer_ptr, ab);
+ // Cleanup host allocated send Xfers
+ for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ void *hostAddr = D2HAddrList[devAddr];
+ delete[] reinterpret_cast<char *>(hostAddr);
+ emisSkipXferArgSet(ab);
+ }
+ // Cleanup host allocated bufs
+ for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
+ void *devAddr = (void *)*((uint64_t *)ab->argptr);
+ void *hostAddr = D2HAddrList[devAddr];
+ delete[] reinterpret_cast<char *>(hostAddr);
+ emisSkipXferArgSet(ab);
+ }
+ id++;
+ }
+ }
+
+ Port.send([&](rpc::Buffer *Buffer, uint32_t ID) {
+ Buffer->data[0] = static_cast<uint64_t>(Results[ID]);
+ delete[] reinterpret_cast<char *>(buf_ptrs[ID]);
+ });
+
+ break;
+ } // END CASE OFFLOAD_EMISSARY
+
default:
return rpc::RPC_UNHANDLED_OPCODE;
break;
diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt
index 54cfdfef440a5..48873601e4f8f 100644
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@@ -7,6 +7,15 @@ if(LLVM_VERSION_MAJOR AND NOT (CMAKE_CXX_COMPILER_ID MATCHES "[Cc]lang" AND
" is not 'Clang ${req_ver}'.")
endif()
+option(OFFLOAD_ENABLE_EMISSARY_APIS "Enable build of GPU Emissary APIs" ON)
+if(OFFLOAD_ENABLE_EMISSARY_APIS)
+ add_definitions(-DOFFLOAD_ENABLE_EMISSARY_APIS)
+ set(emissary_sources
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/EmissaryFortrt.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/src/EmissaryPrint.cpp
+ )
+endif()
+
set(src_files
${CMAKE_CURRENT_SOURCE_DIR}/src/Allocator.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Configuration.cpp
@@ -23,6 +32,7 @@ set(src_files
${CMAKE_CURRENT_SOURCE_DIR}/src/Tasking.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/DeviceUtils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Workshare.cpp
+ ${emissary_sources}
)
list(APPEND compile_options -flto)
diff --git a/openmp/device/src/EmissaryFortrt.cpp b/openmp/device/src/EmissaryFortrt.cpp
new file mode 100644
index 0000000000000..a0d45f4ae2080
--- /dev/null
+++ b/openmp/device/src/EmissaryFortrt.cpp
@@ -0,0 +1,144 @@
+//===- EmissaryFortrt.cpp - Fortran Runtime emissary API ----- ---- c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device stubs for Fortran Runtime emissary API
+//
+//===----------------------------------------------------------------------===//
+
+#include "DeviceTypes.h"
+#include "EmissaryIds.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+
+unsigned long long _emissary_exec(unsigned long long, ...);
+
+extern "C" {
+
+// The clang compiler will generate calls to this only when a string length is
+// not a compile time constant.
+uint32_t __strlen_max(char *instr, uint32_t maxstrlen) {
+ for (uint32_t i = 0; i < maxstrlen; i++)
+ if (instr[i] == (char)0)
+ return (uint32_t)(i + 1);
+ return maxstrlen;
+}
+
+uint32_t omp_get_thread_num();
+uint32_t omp_get_num_threads();
+uint32_t omp_get_team_num();
+uint32_t omp_get_num_teams();
+
+// All Fortran Runtime Functions pass 4 extra args to assist with
+// defered execution and debug. The host variadic wrappers do not use
+// these arguments when calling the actual Fortran runtime.
+#define _EXTRA_ARGS \
+ omp_get_thread_num(), omp_get_num_threads(), omp_get_team_num(), \
+ omp_get_num_teams()
+#define _START_ARGS(idx) _PACK_EMIS_IDS(EMIS_ID_FORTRT, idx, 0, 0), _EXTRA_ARGS,
+
+void *_FortranAioBeginExternalListOutput(uint32_t a1, const char *a2,
+ uint32_t a3) {
+ void *cookie = (void *)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioBeginExternalListOutput_idx, 0,
+ 0),
+ _EXTRA_ARGS, a1, a2, a3);
+ return cookie;
+}
+
+void *_FortranAioBeginExternalFormattedOutput(char *fmt, uint64_t fmtlen,
+ void *ptr, uint32_t val1,
+ char *source_name,
+ uint32_t val2) {
+ fmt[fmtlen - 1] = (char)0;
+ void *cookie = (void *)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT,
+ _FortranAioBeginExternalFormattedOutput_idx, 0, 0),
+ _EXTRA_ARGS, fmt, fmtlen, ptr, val1, source_name, val2);
+ return cookie;
+}
+
+bool _FortranAioOutputAscii(void *a1, char *a2, uint64_t a3) {
+ // insert null terminating char so _emissary_exec can correctly
+ // calculate runtime str length
+ a2[a3 - 1] = (char)0;
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputAscii_idx, 0, 0),
+ _EXTRA_ARGS, a1, a2, a3);
+}
+bool _FortranAioOutputInteger32(void *a1, uint32_t a2) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger32_idx, 0, 0),
+ _EXTRA_ARGS, a1, a2);
+}
+uint32_t _FortranAioEndIoStatement(void *a1) {
+ return (uint32_t)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioEndIoStatement_idx, 0, 0),
+ _EXTRA_ARGS, a1);
+}
+bool _FortranAioOutputInteger8(void *cookie, int8_t n) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger8_idx, 0, 0),
+ _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputInteger16(void *cookie, int16_t n) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger16_idx, 0, 0),
+ _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputInteger64(void *cookie, int64_t n) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputInteger64_idx, 0, 0),
+ _EXTRA_ARGS, cookie, n);
+}
+bool _FortranAioOutputReal32(void *cookie, float x) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputReal32_idx, 0, 0),
+ _EXTRA_ARGS, cookie, x);
+}
+bool _FortranAioOutputReal64(void *cookie, double x) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputReal64_idx, 0, 0),
+ _EXTRA_ARGS, cookie, x);
+}
+bool _FortranAioOutputComplex32(void *cookie, float re, float im) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputComplex32_idx, 0, 0),
+ _EXTRA_ARGS, cookie, re, im);
+}
+bool _FortranAioOutputComplex64(void *cookie, double re, double im) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputComplex64_idx, 0, 0),
+ _EXTRA_ARGS, cookie, re, im);
+}
+bool _FortranAioOutputLogical(void *cookie, bool barg) {
+ return (bool)_emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAioOutputLogical_idx, 0, 0),
+ _EXTRA_ARGS, cookie, barg);
+}
+void _FortranAAbort() {
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAAbort_idx, 0, 0),
+ _EXTRA_ARGS);
+ // When host service _FortranAAbort finishes, we must die from the device.
+ __builtin_trap();
+}
+void _FortranAStopStatement(int32_t a1, bool a2, bool a3) {
+ _emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAStopStatement_idx, 0, 0),
+ _EXTRA_ARGS, a1, a2, a3);
+ __builtin_trap();
+}
+void _FortranAStopStatementText(char *errmsg, int64_t a1, bool a2, bool a3) {
+ errmsg[a1 - 1] = (char)0;
+ _emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_FORTRT, _FortranAStopStatementText_idx, 0, 0),
+ _EXTRA_ARGS, errmsg, a1, a2, a3);
+ __builtin_trap();
+}
+
+} // end extern "C"
+#undef _EXTRA_ARGS
diff --git a/openmp/device/src/EmissaryPrint.cpp b/openmp/device/src/EmissaryPrint.cpp
new file mode 100644
index 0000000000000..80bd0262d5ed7
--- /dev/null
+++ b/openmp/device/src/EmissaryPrint.cpp
@@ -0,0 +1,79 @@
+//===----------- EmissaryPrint.cpp - Misc Emissary API ------------ c++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Device stubs for misc emissary API
+//
+//===----------------------------------------------------------------------===//
+
+#include "Allocator.h"
+#include "Configuration.h"
+#include "DeviceTypes.h"
+#include "Shared/RPCOpcodes.h"
+#include "shared/rpc.h"
+
+#include "Debug.h"
+#include "EmissaryIds.h"
+
+#if 0
+__attribute__((flatten, always_inline)) void f90print_(char *s) {
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s\n", s);
+}
+__attribute__((flatten, always_inline)) void f90printi_(char *s, int *i) {
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s $d\n", s, *i);
+}
+__attribute__((flatten, always_inline)) void f90printl_(char *s, long *i) {
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %ld\n", s, *i);
+}
+__attribute__((flatten, always_inline)) void f90printf_(char *s, float *f) {
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %f\n", s, *f);
+}
+__attribute__((flatten, always_inline)) void f90printd_(char *s, double *d) {
+ _emissary_exec(_PACK_EMIS_IDS(EMIS_ID_PRINT, _printf_idx,0,0), "%s %g\n", s, *d);
+}
+#endif
+
+// This definition of __ockl_devmem_request and __ockl_sanitizer_report needs to
+// override the weak symbol for __ockl_devmem_request and
+// __ockl_sanitizer_report in rocm device lib ockl.bc because ockl uses
+// hostcall but OpenMP uses rpc.
+//
+extern "C" {
+__attribute__((noinline)) int
+__ockl_sanitizer_report(uint64_t addr, uint64_t pc, uint64_t wgidx,
+ uint64_t wgidy, uint64_t wgidz, uint64_t wave_id,
+ uint64_t is_read, uint64_t access_size) {
+ unsigned long long rc = _emissary_exec(
+ _PACK_EMIS_IDS(EMIS_ID_PRINT, _ockl_asan_report_idx, 0, 0), addr, pc,
+ wgidx, wgidy, wgidz, wave_id, is_read, access_size);
+ return ((int)rc);
+}
+#if SANITIZER_AMDGPU
+__attribute__((noinline)) uint64_t __asan_malloc_impl(uint64_t bufsz,
+ uint64_t pc);
+__attribute__((noinline)) void __asan_free_impl(uint64_t ptr, uint64_t pc);
+#endif
+
+__attribute__((flatten, always_inline)) char *global_allocate(uint32_t bufsz) {
+#if SANITIZER_AMDGPU
+ return (char *)__asan_malloc_impl(bufsz,
+ (uint64_t)__builtin_return_address(0));
+#else
+ return (char *)malloc((uint64_t)bufsz);
+#endif
+}
+__attribute__((flatten, always_inline)) int global_free(void *ptr) {
+#if SANITIZER_AMDGPU
+ __asan_free_impl((uint64_t)ptr, (uint64_t)__builtin_return_address(0));
+#else
+ free(ptr);
+#endif
+ return 0;
+}
+
+//
+} // end extern "C"
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 5d5a2a383f2b2..fc300d60ea752 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -134,6 +134,68 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
Port.close();
return Ret;
}
+
+// Calls to __llvm_omp_emissary_rpc and __llvm_omp_emissary_premalloc are
+// generated by device codegen at _emissary_exec call sites.
+// See clang/lib/CodeGen/CGEmitEmissaryExec.cpp
+void *__llvm_emissary_premalloc(uint32_t sz) {
+ return omp_alloc((size_t)sz, omp_default_mem_alloc);
+}
+
+unsigned long long __llvm_emissary_rpc(uint32_t sz32, void *bufdata) {
+ rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_EMISSARY>();
+ Port.send_n(bufdata, (size_t)sz32);
+ unsigned long long Ret;
+ Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+ Ret = static_cast<unsigned long long>(Buffer->data[0]);
+ });
+ omp_free(bufdata, omp_default_mem_alloc);
+ Port.close();
+ return Ret;
+}
+
+// This is for emissary APIs that require d2h or h2d memory transfers.
+unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
+ rpc::Client::Port Port = ompx::impl::Client.open<OFFLOAD_EMISSARY_DM>();
+ Port.send_n(bufdata, (size_t)sz32);
+
+ char *data = (char *)bufdata;
+ uint32_t *int32_data = (uint32_t *)data;
+ uint32_t NumArgs = int32_data[1];
+ char *keyptr = data + (2 * sizeof(int));
+ char *argptr = keyptr + (NumArgs * sizeof(int));
+ if (((size_t)argptr) % (size_t)8)
+ argptr += 4; // argptr must be aligned
+ uint64_t arg1 = *(uint64_t *)argptr;
+ uint32_t NumSendXfers = (unsigned int)((arg1 >> 16) & 0xFFFF);
+ uint32_t NumRecvXfers = (unsigned int)((arg1) & 0xFFFF);
+ // Skip by arg1 and process Send and Recv Xfers if any
+ argptr += sizeof(uint64_t);
+ for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
+ void *D2Hdata = (void *)*((uint64_t *)argptr);
+ argptr += sizeof(void *);
+ size_t D2Hsize = (size_t) * ((size_t *)argptr);
+ argptr += sizeof(size_t);
+ Port.send_n(D2Hdata, D2Hsize);
+ }
+ for (uint32_t idx = 0; idx < NumRecvXfers; idx++) {
+ void *H2Ddata = (void *)*((uint64_t *)argptr);
+ argptr += sizeof(void *);
+ argptr += sizeof(size_t);
+ uint64_t recv_size;
+ void *buf = nullptr;
+ Port.recv_n(&buf, &recv_size,
+ [&](uint64_t) { return reinterpret_cast<void *>(H2Ddata); });
+ }
+
+ unsigned long long Ret;
+ Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
+ Ret = static_cast<unsigned long long>(Buffer->data[0]);
+ });
+ omp_free(bufdata, omp_default_mem_alloc);
+ Port.close();
+ return Ret;
+}
}
// C++ ABI helpers.
>From e61bf8f536e5dfa3b1fc61d6dbbddfa96386a110 Mon Sep 17 00:00:00 2001
From: gregrodgers <Gregory.Rodgers at amd.com>
Date: Mon, 12 Jan 2026 06:43:48 -0600
Subject: [PATCH 2/2] [clang-format] apply more recent clang-format. Only
clang-format changes with this commit
---
clang/include/clang/Options/Options.td | 12 +++++++-----
clang/lib/Headers/EmissaryIds.h | 4 ++--
offload/plugins-nextgen/common/src/Emissary.cpp | 2 +-
offload/plugins-nextgen/common/src/EmissaryPrint.cpp | 6 +++---
offload/plugins-nextgen/common/src/RPC.cpp | 6 +++---
openmp/device/src/Misc.cpp | 2 +-
6 files changed, 17 insertions(+), 15 deletions(-)
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 8cf017d8effd7..5e9c2e35a431a 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -8940,11 +8940,13 @@ def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">,
} // let Visibility = [CC1Option, FC1Option]
-defm use_emissary_print: BoolFOption<"use-emissary-print",
- LangOpts<"UseEmissaryPrint">, DefaultTrue,
- PosFlag<SetTrue, [], [ClangOption, CC1Option],
- "Enable use of Emissary printf/fprint overriding device libc printf/fprintf">,
- NegFlag<SetFalse>>;
+defm use_emissary_print
+ : BoolFOption<"use-emissary-print", LangOpts<"UseEmissaryPrint">,
+ DefaultTrue,
+ PosFlag<SetTrue, [], [ClangOption, CC1Option],
+ "Enable use of Emissary printf/fprint overriding "
+ "device libc printf/fprintf">,
+ NegFlag<SetFalse>>;
//===----------------------------------------------------------------------===//
// Coarray Options
diff --git a/clang/lib/Headers/EmissaryIds.h b/clang/lib/Headers/EmissaryIds.h
index f9ff616a0aacb..3c38f3d24bdaa 100644
--- a/clang/lib/Headers/EmissaryIds.h
+++ b/clang/lib/Headers/EmissaryIds.h
@@ -33,8 +33,8 @@ typedef enum {
/// The vargs function used by emissary API device stubs
unsigned long long _emissary_exec(unsigned long long, ...);
-//#define _PACK_EMIS_IDS(x, y) \
-// ((unsigned long long)x << 32) | ((unsigned long long)y)
+// #define _PACK_EMIS_IDS(x, y) \
+// ((unsigned long long)x << 32) | ((unsigned long long)y)
#define _PACK_EMIS_IDS(a, b, c, d) \
((unsigned long long)a << 48) | ((unsigned long long)b << 32) | \
diff --git a/offload/plugins-nextgen/common/src/Emissary.cpp b/offload/plugins-nextgen/common/src/Emissary.cpp
index 0be0e426ee050..063e1aafe9c34 100644
--- a/offload/plugins-nextgen/common/src/Emissary.cpp
+++ b/offload/plugins-nextgen/common/src/Emissary.cpp
@@ -196,7 +196,7 @@ EmissaryBuildVargs(int NumArgs, char *keyptr, char *dataptr, char *strptr,
if (numbits == 1) { // This is a pointer to string
num_bytes = 4;
bytes_consumed = num_bytes;
- strsz = (size_t) * (unsigned int *)dataptr;
+ strsz = (size_t)*(unsigned int *)dataptr;
if ((*data_not_used) < bytes_consumed)
return _ERC_DATA_USED_ERROR;
a[argcount] = (emis_argptr_t *)((char *)strptr);
diff --git a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
index 80c55d3c5a5fd..88d92b3274ae1 100644
--- a/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
+++ b/offload/plugins-nextgen/common/src/EmissaryPrint.cpp
@@ -276,7 +276,7 @@ static service_rc emissary_pfBuildValist(emissary_ValistExt_t *valist,
if (numbits == 1) { // This is a pointer to string
num_bytes = 4;
bytes_consumed = num_bytes;
- strsz = (size_t) * (unsigned int *)dataptr;
+ strsz = (size_t)*(unsigned int *)dataptr;
if ((*data_not_used) < bytes_consumed)
return _ERC_DATA_USED_ERROR;
if (emissary_pfAddString(valist, (char *)&strptr, strsz, &stacksize))
@@ -355,7 +355,7 @@ static service_rc emissary_fprintf(uint *rc, emisArgBuf_t *ab) {
// Skip past the format string
ab->NumArgs--;
ab->keyptr += 4;
- size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+ size_t abstrsz = (size_t)*(unsigned int *)ab->argptr;
ab->strptr += abstrsz;
ab->argptr += 4;
ab->data_not_used -= 4;
@@ -389,7 +389,7 @@ static service_rc emissary_printf(uint *rc, emisArgBuf_t *ab) {
// Skip past the format string
ab->NumArgs--;
ab->keyptr += 4;
- size_t abstrsz = (size_t) * (unsigned int *)ab->argptr;
+ size_t abstrsz = (size_t)*(unsigned int *)ab->argptr;
ab->strptr += abstrsz;
ab->argptr += 4;
ab->data_not_used -= 4;
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index c3f041aa26637..9474982637135 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -112,13 +112,13 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
for (uint32_t idx = 0; idx < ab->NumSendXfers; idx++) {
numSendXfers++;
devXfers[id] = (void *)*((uint64_t *)ab->argptr);
- XferSzs[id] = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+ XferSzs[id] = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
emisSkipXferArgSet(ab);
}
// Allocate the host space for the receive Xfers
for (uint32_t idx = 0; idx < ab->NumRecvXfers; idx++) {
void *devAddr = (void *)*((uint64_t *)ab->argptr);
- size_t devSz = (size_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+ size_t devSz = (size_t)*((size_t *)(ab->argptr + sizeof(void *)));
void *hostAddr = new char[devSz];
D2HAddrList.insert(std::pair<void *, void *>(devAddr, hostAddr));
emisSkipXferArgSet(ab);
@@ -174,7 +174,7 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
void *devAddr = (void *)*((uint64_t *)ab->argptr);
recvXfers[id] = D2HAddrList[devAddr];
recvXferSzs[id] =
- (uint64_t) * ((size_t *)(ab->argptr + sizeof(void *)));
+ (uint64_t)*((size_t *)(ab->argptr + sizeof(void *)));
emisSkipXferArgSet(ab);
}
id++;
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index fc300d60ea752..77f9815dd2a95 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -174,7 +174,7 @@ unsigned long long __llvm_emissary_rpc_dm(uint32_t sz32, void *bufdata) {
for (uint32_t idx = 0; idx < NumSendXfers; idx++) {
void *D2Hdata = (void *)*((uint64_t *)argptr);
argptr += sizeof(void *);
- size_t D2Hsize = (size_t) * ((size_t *)argptr);
+ size_t D2Hsize = (size_t)*((size_t *)argptr);
argptr += sizeof(size_t);
Port.send_n(D2Hdata, D2Hsize);
}
More information about the Openmp-commits
mailing list