[clang] [CUDA] Add device-side kernel launch support (PR #165519)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Nov 21 13:25:47 PST 2025
https://github.com/darkbuck updated https://github.com/llvm/llvm-project/pull/165519
From ec6233175ae8d981985dea9dfb724fa2daf87b1f Mon Sep 17 00:00:00 2001
From: Michael Liao <michael.hliao at gmail.com>
Date: Sat, 18 Oct 2025 19:46:39 -0400
Subject: [PATCH] [CUDA] Add device-side kernel launch support
- CUDA's dynamic parallelism extension allows device-side kernel
  launches, which share the same syntax as host-side launches,
  e.g.,

    kernel<<<Dg, Db, Ns, S>>>(arguments);

  but differ in code generation: a device-side kernel launch is
  eventually translated into the following sequence

    config = cudaGetParameterBuffer(alignment, size);
    // set up arguments by copying them into `config`.
    cudaLaunchDevice(func, config, Dg, Db, Ns, S);
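
  For reference, a minimal dynamic-parallelism example that this patch
  makes compilable (kernel names are illustrative, mirroring the test
  added below):

    __global__ void child(int x) { /* use x */ }

    __global__ void parent() {
      // A device-side launch uses the same <<<>>> syntax as on the host.
      child<<<1, 1>>>(42);
    }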
- To support the device-side kernel launch, 'CUDAKernelCallExpr' is
  reused, but its config expression is set to a call to 'cudaLaunchDevice'.
  During code generation, 'CUDAKernelCallExpr' is expanded into the
  aforementioned sequence.
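
  Concretely, for the 'child<<<1, 1>>>(42)' launch above, the expansion
  is roughly the following guarded sequence (a sketch, not the literal
  emitted IR; the 64-byte alignment matches what this patch's codegen
  uses):

    void *config = cudaGetParameterBuffer(/*alignment=*/64, /*size=*/sizeof(int));
    if (config != nullptr) {
      // Kernel arguments are copied into `config` at their natural
      // alignments; here the single 'int' argument lands at offset 0.
      *(int *)config = 42;
      cudaLaunchDevice((void *)child, config, /*gridDim=*/1, /*blockDim=*/1,
                       /*sharedMem=*/0, /*stream=*/nullptr);
    }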
- As the device-side kernel launch requires the code to be compiled as
  relocatable device code and linked with '-lcudadevrt',
  'clang-nvlink-wrapper' is modified to forward archives with fat
  binaries directly.
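
  An illustrative compile-and-link invocation (the file name and GPU
  arch are placeholders; CUDA installation paths omitted):

    clang++ --cuda-gpu-arch=sm_52 -fgpu-rdc dynpar.cu -lcudadevrt -o dynpar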
---
clang/include/clang/AST/ASTContext.h | 16 +++
.../clang/Basic/DiagnosticSemaKinds.td | 7 ++
clang/include/clang/Sema/SemaCUDA.h | 5 +
clang/include/clang/Serialization/ASTReader.h | 2 +-
clang/lib/CodeGen/CGCUDARuntime.cpp | 106 ++++++++++++++++++
clang/lib/CodeGen/CGCUDARuntime.h | 4 +
clang/lib/CodeGen/CGExprCXX.cpp | 6 +
clang/lib/Sema/SemaCUDA.cpp | 99 +++++++++++++++-
clang/lib/Sema/SemaDecl.cpp | 32 ++++--
clang/lib/Serialization/ASTReader.cpp | 8 +-
clang/lib/Serialization/ASTWriter.cpp | 37 +++---
clang/test/CodeGenCUDA/Inputs/cuda.h | 8 +-
clang/test/CodeGenCUDA/device-kernel-call.cu | 35 ++++++
clang/test/Driver/nvlink-wrapper.c | 9 +-
clang/test/SemaCUDA/Inputs/cuda.h | 7 ++
.../test/SemaCUDA/call-kernel-from-kernel.cu | 5 +-
clang/test/SemaCUDA/device-kernel-call.cu | 23 ++++
clang/test/SemaCUDA/function-overload.cu | 26 ++---
clang/test/SemaCUDA/function-target.cu | 4 +-
clang/test/SemaCUDA/reference-to-kernel-fn.cu | 4 +-
.../ClangNVLinkWrapper.cpp | 76 +++++++++++--
.../tools/clang-nvlink-wrapper/NVLinkOpts.td | 5 +
22 files changed, 457 insertions(+), 67 deletions(-)
create mode 100644 clang/test/CodeGenCUDA/device-kernel-call.cu
create mode 100644 clang/test/SemaCUDA/device-kernel-call.cu
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 6e9e737dcae4f..303e8f0e9a7a4 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -500,6 +500,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
/// Declaration for the CUDA cudaConfigureCall function.
FunctionDecl *cudaConfigureCallDecl = nullptr;
+ /// Declaration for the CUDA cudaGetParameterBuffer function.
+ FunctionDecl *cudaGetParameterBufferDecl = nullptr;
+ /// Declaration for the CUDA cudaLaunchDevice function.
+ FunctionDecl *cudaLaunchDeviceDecl = nullptr;
/// Keeps track of all declaration attributes.
///
@@ -1653,6 +1657,18 @@ class ASTContext : public RefCountedBase<ASTContext> {
return cudaConfigureCallDecl;
}
+ void setcudaGetParameterBufferDecl(FunctionDecl *FD) {
+ cudaGetParameterBufferDecl = FD;
+ }
+
+ FunctionDecl *getcudaGetParameterBufferDecl() {
+ return cudaGetParameterBufferDecl;
+ }
+
+ void setcudaLaunchDeviceDecl(FunctionDecl *FD) { cudaLaunchDeviceDecl = FD; }
+
+ FunctionDecl *getcudaLaunchDeviceDecl() { return cudaLaunchDeviceDecl; }
+
/// Returns true iff we need copy/dispose helpers for the given type.
bool BlockRequiresCopying(QualType Ty, const VarDecl *D);
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 53aa86a7dabde..f7891af357090 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -9530,6 +9530,8 @@ def err_kern_is_nonstatic_method : Error<
"kernel function %0 must be a free function or static member function">;
def err_config_scalar_return : Error<
"CUDA special function '%0' must have scalar return type">;
+def err_config_pointer_return
+ : Error<"CUDA special function '%0' must have pointer return type">;
def err_kern_call_not_global_function : Error<
"kernel call to non-global function %0">;
def err_global_call_not_config : Error<
@@ -13747,4 +13749,9 @@ def warn_comparison_in_enum_initializer : Warning<
def note_enum_compare_typo_suggest : Note<
"use '%0' to perform a bitwise shift">;
+def err_cuda_device_kernel_launch_not_supported
+ : Error<"device-side kernel call/launch is not supported">;
+def err_cuda_device_kernel_launch_require_rdc
+ : Error<"kernel launch from __device__ or __global__ function requires "
+ "relocatable device code (i.e. requires -fgpu-rdc)">;
} // end of sema component.
diff --git a/clang/include/clang/Sema/SemaCUDA.h b/clang/include/clang/Sema/SemaCUDA.h
index dbc1432860d89..dbb4290f5d149 100644
--- a/clang/include/clang/Sema/SemaCUDA.h
+++ b/clang/include/clang/Sema/SemaCUDA.h
@@ -273,6 +273,11 @@ class SemaCUDA : public SemaBase {
/// of the function that will be called to configure kernel call, with the
/// parameters specified via <<<>>>.
std::string getConfigureFuncName() const;
+ /// Return the name of the parameter buffer allocation function for the
+ /// device kernel launch.
+ std::string getGetParameterBufferFuncName() const;
+ /// Return the name of the device kernel launch function.
+ std::string getLaunchDeviceFuncName() const;
/// Record variables that are potentially ODR-used in CUDA/HIP.
void recordPotentialODRUsedVariable(MultiExprArg Args,
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index a27cfe8a9b307..d276f0d21b958 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -1005,7 +1005,7 @@ class ASTReader
///
/// The AST context tracks a few important decls, currently cudaConfigureCall,
/// directly.
- SmallVector<GlobalDeclID, 2> CUDASpecialDeclRefs;
+ SmallVector<GlobalDeclID, 4> CUDASpecialDeclRefs;
/// The floating point pragma option settings.
SmallVector<uint64_t, 1> FPPragmaOptions;
diff --git a/clang/lib/CodeGen/CGCUDARuntime.cpp b/clang/lib/CodeGen/CGCUDARuntime.cpp
index 121a481213396..9cbdb641d00a1 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.cpp
+++ b/clang/lib/CodeGen/CGCUDARuntime.cpp
@@ -22,6 +22,112 @@ using namespace CodeGen;
CGCUDARuntime::~CGCUDARuntime() {}
+static llvm::Value *emitGetParamBuf(CodeGenFunction &CGF,
+ const CUDAKernelCallExpr *E) {
+ auto *GetParamBuf = CGF.getContext().getcudaGetParameterBufferDecl();
+ const FunctionProtoType *GetParamBufProto =
+ GetParamBuf->getType()->getAs<FunctionProtoType>();
+
+ DeclRefExpr *DRE = DeclRefExpr::Create(
+ CGF.getContext(), {}, {}, GetParamBuf,
+ /*RefersToEnclosingVariableOrCapture=*/false, GetParamBuf->getNameInfo(),
+ GetParamBuf->getType(), VK_PRValue);
+ auto *ImpCast = ImplicitCastExpr::Create(
+ CGF.getContext(), CGF.getContext().getPointerType(GetParamBuf->getType()),
+ CK_FunctionToPointerDecay, DRE, nullptr, VK_PRValue, FPOptionsOverride());
+
+ CGCallee Callee = CGF.EmitCallee(ImpCast);
+ CallArgList Args;
+ // Use 64B alignment.
+ Args.add(RValue::get(CGF.CGM.getSize(CharUnits::fromQuantity(64))),
+ CGF.getContext().getSizeType());
+ // Calculate parameter sizes.
+ const PointerType *PT = E->getCallee()->getType()->getAs<PointerType>();
+ const FunctionProtoType *FTP =
+ PT->getPointeeType()->getAs<FunctionProtoType>();
+ CharUnits Offset = CharUnits::Zero();
+ for (auto ArgTy : FTP->getParamTypes()) {
+ auto TInfo = CGF.CGM.getContext().getTypeInfoInChars(ArgTy);
+ Offset = Offset.alignTo(TInfo.Align) + TInfo.Width;
+ }
+ Args.add(RValue::get(CGF.CGM.getSize(Offset)),
+ CGF.getContext().getSizeType());
+ const CGFunctionInfo &CallInfo = CGF.CGM.getTypes().arrangeFreeFunctionCall(
+ Args, GetParamBufProto, /*ChainCall=*/false);
+ auto Ret = CGF.EmitCall(CallInfo, Callee, /*ReturnValue=*/{}, Args);
+
+ return Ret.getScalarVal();
+}
+
+RValue CGCUDARuntime::EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke) {
+ ASTContext &Ctx = CGM.getContext();
+ assert(Ctx.getcudaLaunchDeviceDecl() == E->getConfig()->getDirectCallee());
+
+ llvm::BasicBlock *ConfigOKBlock = CGF.createBasicBlock("dkcall.configok");
+ llvm::BasicBlock *ContBlock = CGF.createBasicBlock("dkcall.end");
+
+ llvm::Value *Config = emitGetParamBuf(CGF, E);
+ CGF.Builder.CreateCondBr(
+ CGF.Builder.CreateICmpNE(Config,
+ llvm::Constant::getNullValue(Config->getType())),
+ ConfigOKBlock, ContBlock);
+
+ CodeGenFunction::ConditionalEvaluation eval(CGF);
+
+ eval.begin(CGF);
+ CGF.EmitBlock(ConfigOKBlock);
+
+ QualType KernelCalleeFuncTy =
+ E->getCallee()->getType()->getAs<PointerType>()->getPointeeType();
+ CGCallee KernelCallee = CGF.EmitCallee(E->getCallee());
+ // Emit kernel arguments.
+ CallArgList KernelCallArgs;
+ CGF.EmitCallArgs(KernelCallArgs,
+ KernelCalleeFuncTy->getAs<FunctionProtoType>(),
+ E->arguments(), E->getDirectCallee());
+ // Copy emitted kernel arguments into that parameter buffer.
+ RawAddress CfgBase(Config, CGM.Int8Ty,
+ /*Alignment=*/CharUnits::fromQuantity(64));
+ CharUnits Offset = CharUnits::Zero();
+ for (auto &Arg : KernelCallArgs) {
+ auto TInfo = CGM.getContext().getTypeInfoInChars(Arg.getType());
+ Offset = Offset.alignTo(TInfo.Align);
+ Address Addr =
+ CGF.Builder.CreateConstInBoundsGEP(CfgBase, Offset.getQuantity());
+ Arg.copyInto(CGF, Addr);
+ Offset += TInfo.Width;
+ }
+ // Emit the `cudaLaunchDevice` call, i.e. E->getConfig().
+ const CallExpr *LaunchCall = E->getConfig();
+ QualType LaunchCalleeFuncTy = LaunchCall->getCallee()
+ ->getType()
+ ->getAs<PointerType>()
+ ->getPointeeType();
+ CGCallee LaunchCallee = CGF.EmitCallee(LaunchCall->getCallee());
+ CallArgList LaunchCallArgs;
+ CGF.EmitCallArgs(LaunchCallArgs,
+ LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
+ LaunchCall->arguments(), LaunchCall->getDirectCallee());
+ // Replace the func and parameter buffer arguments.
+ LaunchCallArgs[0] = CallArg(RValue::get(KernelCallee.getFunctionPointer()),
+ CGM.getContext().VoidPtrTy);
+ LaunchCallArgs[1] = CallArg(RValue::get(Config), CGM.getContext().VoidPtrTy);
+ const CGFunctionInfo &LaunchCallInfo = CGM.getTypes().arrangeFreeFunctionCall(
+ LaunchCallArgs, LaunchCalleeFuncTy->getAs<FunctionProtoType>(),
+ /*ChainCall=*/false);
+ CGF.EmitCall(LaunchCallInfo, LaunchCallee, ReturnValue, LaunchCallArgs,
+ CallOrInvoke,
+ /*IsMustTail=*/false, E->getExprLoc());
+ CGF.EmitBranch(ContBlock);
+
+ CGF.EmitBlock(ContBlock);
+ eval.end(CGF);
+
+ return RValue::get(nullptr);
+}
+
RValue CGCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h
index 86f776004ee7c..64fb9a31422e0 100644
--- a/clang/lib/CodeGen/CGCUDARuntime.h
+++ b/clang/lib/CodeGen/CGCUDARuntime.h
@@ -88,6 +88,10 @@ class CGCUDARuntime {
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke = nullptr);
+ virtual RValue EmitCUDADeviceKernelCallExpr(
+ CodeGenFunction &CGF, const CUDAKernelCallExpr *E,
+ ReturnValueSlot ReturnValue, llvm::CallBase **CallOrInvoke = nullptr);
+
/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index f64cf9f8a6c2d..8a2c021b2210f 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -503,6 +503,12 @@ RValue CodeGenFunction::EmitCXXOperatorMemberCallExpr(
RValue CodeGenFunction::EmitCUDAKernelCallExpr(const CUDAKernelCallExpr *E,
ReturnValueSlot ReturnValue,
llvm::CallBase **CallOrInvoke) {
+ auto *FD = E->getConfig()->getDirectCallee();
+ // Emit as a device kernel call if the config expression is a call to
+ // 'cudaLaunchDevice'.
+ if (FD && CGM.getContext().getcudaLaunchDeviceDecl() == FD)
+ return CGM.getCUDARuntime().EmitCUDADeviceKernelCallExpr(
+ *this, E, ReturnValue, CallOrInvoke);
return CGM.getCUDARuntime().EmitCUDAKernelCallExpr(*this, E, ReturnValue,
CallOrInvoke);
}
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 31735a0f5feb3..dd9bcab56b083 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -52,16 +52,94 @@ bool SemaCUDA::PopForceHostDevice() {
ExprResult SemaCUDA::ActOnExecConfigExpr(Scope *S, SourceLocation LLLLoc,
MultiExprArg ExecConfig,
SourceLocation GGGLoc) {
- FunctionDecl *ConfigDecl = getASTContext().getcudaConfigureCallDecl();
+ bool IsDeviceKernelCall = false;
+ switch (CurrentTarget()) {
+ case CUDAFunctionTarget::Global:
+ case CUDAFunctionTarget::Device:
+ IsDeviceKernelCall = true;
+ break;
+ case CUDAFunctionTarget::HostDevice:
+ if (getLangOpts().CUDAIsDevice) {
+ IsDeviceKernelCall = true;
+ if (FunctionDecl *Caller =
+ SemaRef.getCurFunctionDecl(/*AllowLambda=*/true);
+ Caller && isImplicitHostDeviceFunction(Caller)) {
+ // Under device compilation, a config call in an HD function should be
+ // treated as a device kernel call. But for implicit HD functions (such
+ // as lambdas), we need to check whether RDC is enabled.
+ if (!getLangOpts().GPURelocatableDeviceCode)
+ IsDeviceKernelCall = false;
+ // HIP doesn't support device-side kernel calls yet. Still treat this
+ // as a host-side kernel call.
+ if (getLangOpts().HIP)
+ IsDeviceKernelCall = false;
+ }
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (IsDeviceKernelCall && getLangOpts().HIP)
+ return ExprError(
+ Diag(LLLLoc, diag::err_cuda_device_kernel_launch_not_supported));
+
+ if (IsDeviceKernelCall && !getLangOpts().GPURelocatableDeviceCode)
+ return ExprError(
+ Diag(LLLLoc, diag::err_cuda_device_kernel_launch_require_rdc));
+
+ FunctionDecl *ConfigDecl = IsDeviceKernelCall
+ ? getASTContext().getcudaLaunchDeviceDecl()
+ : getASTContext().getcudaConfigureCallDecl();
if (!ConfigDecl)
return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
- << getConfigureFuncName());
+ << (IsDeviceKernelCall ? getLaunchDeviceFuncName()
+ : getConfigureFuncName()));
+ // A device kernel call additionally requires 'cudaGetParameterBuffer'.
+ if (IsDeviceKernelCall) {
+ auto *GetParamBuf = getASTContext().getcudaGetParameterBufferDecl();
+ if (!GetParamBuf)
+ return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
+ << getGetParameterBufferFuncName());
+ }
+
QualType ConfigQTy = ConfigDecl->getType();
DeclRefExpr *ConfigDR = new (getASTContext()) DeclRefExpr(
getASTContext(), ConfigDecl, false, ConfigQTy, VK_LValue, LLLLoc);
SemaRef.MarkFunctionReferenced(LLLLoc, ConfigDecl);
+ if (IsDeviceKernelCall) {
+ SmallVector<Expr *> Args;
+ // Use a placeholder for the kernel function, which may not be
+ // resolvable here; for example, resolving an overloaded kernel may
+ // require the actual kernel arguments.
+ llvm::APInt Zero(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 0);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Use a placeholder for the parameter buffer; it is replaced with the
+ // actual allocation later, during codegen.
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the original config arguments.
+ llvm::append_range(Args, ExecConfig);
+ // Add the default blockDim if it's missing.
+ if (Args.size() < 4) {
+ llvm::APInt One(SemaRef.Context.getTypeSize(SemaRef.Context.IntTy), 1);
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, One,
+ SemaRef.Context.IntTy, LLLLoc));
+ }
+ // Add the default sharedMemSize if it's missing.
+ if (Args.size() < 5)
+ Args.push_back(IntegerLiteral::Create(SemaRef.Context, Zero,
+ SemaRef.Context.IntTy, LLLLoc));
+ // Add the default stream if it's missing.
+ if (Args.size() < 6)
+ Args.push_back(new (SemaRef.Context) CXXNullPtrLiteralExpr(
+ SemaRef.Context.NullPtrTy, LLLLoc));
+ return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, Args, GGGLoc, nullptr,
+ /*IsExecConfig=*/true);
+ }
return SemaRef.BuildCallExpr(S, ConfigDR, LLLLoc, ExecConfig, GGGLoc, nullptr,
/*IsExecConfig=*/true);
}
@@ -246,12 +324,12 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
CalleeTarget == CUDAFunctionTarget::InvalidTarget)
return CFP_Never;
- // (a) Can't call global from some contexts until we support CUDA's
- // dynamic parallelism.
+ // (a) Calling global from either global or device contexts is allowed
+ // as part of CUDA's dynamic parallelism support.
if (CalleeTarget == CUDAFunctionTarget::Global &&
(CallerTarget == CUDAFunctionTarget::Global ||
CallerTarget == CUDAFunctionTarget::Device))
- return CFP_Never;
+ return CFP_Native;
// (b) Calling HostDevice is OK for everyone.
if (CalleeTarget == CUDAFunctionTarget::HostDevice)
@@ -279,7 +357,8 @@ SemaCUDA::IdentifyPreference(const FunctionDecl *Caller,
if (CallerTarget == CUDAFunctionTarget::HostDevice) {
// It's OK to call a compilation-mode matching function from an HD one.
if ((getLangOpts().CUDAIsDevice &&
- CalleeTarget == CUDAFunctionTarget::Device) ||
+ (CalleeTarget == CUDAFunctionTarget::Device ||
+ CalleeTarget == CUDAFunctionTarget::Global)) ||
(!getLangOpts().CUDAIsDevice &&
(CalleeTarget == CUDAFunctionTarget::Host ||
CalleeTarget == CUDAFunctionTarget::Global)))
@@ -1103,6 +1182,14 @@ std::string SemaCUDA::getConfigureFuncName() const {
return "cudaConfigureCall";
}
+std::string SemaCUDA::getGetParameterBufferFuncName() const {
+ return "cudaGetParameterBuffer";
+}
+
+std::string SemaCUDA::getLaunchDeviceFuncName() const {
+ return "cudaLaunchDevice";
+}
+
// Record any local constexpr variables that are passed one way on the host
// and another on the device.
void SemaCUDA::recordPotentialODRUsedVariable(
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index b8ca2a376fde8..3426e57981640 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -11056,14 +11056,30 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,
}
if (getLangOpts().CUDA) {
- IdentifierInfo *II = NewFD->getIdentifier();
- if (II && II->isStr(CUDA().getConfigureFuncName()) &&
- !NewFD->isInvalidDecl() &&
- NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
- if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
- Diag(NewFD->getLocation(), diag::err_config_scalar_return)
- << CUDA().getConfigureFuncName();
- Context.setcudaConfigureCallDecl(NewFD);
+ if (IdentifierInfo *II = NewFD->getIdentifier()) {
+ if (II->isStr(CUDA().getConfigureFuncName()) && !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getConfigureFuncName();
+ Context.setcudaConfigureCallDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getGetParameterBufferFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isPointerType())
+ Diag(NewFD->getLocation(), diag::err_config_pointer_return)
+ << CUDA().getGetParameterBufferFuncName();
+ Context.setcudaGetParameterBufferDecl(NewFD);
+ }
+ if (II->isStr(CUDA().getLaunchDeviceFuncName()) &&
+ !NewFD->isInvalidDecl() &&
+ NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
+ if (!R->castAs<FunctionType>()->getReturnType()->isScalarType())
+ Diag(NewFD->getLocation(), diag::err_config_scalar_return)
+ << CUDA().getLaunchDeviceFuncName();
+ Context.setcudaLaunchDeviceDecl(NewFD);
+ }
}
}
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index 55c52154c4113..5c82cafc49177 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -5580,9 +5580,13 @@ void ASTReader::InitializeContext() {
// If there were any CUDA special declarations, deserialize them.
if (!CUDASpecialDeclRefs.empty()) {
- assert(CUDASpecialDeclRefs.size() == 1 && "More decl refs than expected!");
+ assert(CUDASpecialDeclRefs.size() == 3 && "More decl refs than expected!");
Context.setcudaConfigureCallDecl(
- cast<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[0])));
+ Context.setcudaGetParameterBufferDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[1])));
+ Context.setcudaLaunchDeviceDecl(
+ cast_or_null<FunctionDecl>(GetDecl(CUDASpecialDeclRefs[2])));
}
// Re-export any modules that were imported by a non-module AST file.
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 547497cbd87d9..1871e48df35ff 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -5706,8 +5706,13 @@ void ASTWriter::PrepareWritingSpecialDecls(Sema &SemaRef) {
GetDeclRef(SemaRef.getStdAlignValT());
}
- if (Context.getcudaConfigureCallDecl())
+ if (Context.getcudaConfigureCallDecl() ||
+ Context.getcudaGetParameterBufferDecl() ||
+ Context.getcudaLaunchDeviceDecl()) {
GetDeclRef(Context.getcudaConfigureCallDecl());
+ GetDeclRef(Context.getcudaGetParameterBufferDecl());
+ GetDeclRef(Context.getcudaLaunchDeviceDecl());
+ }
// Writing all of the known namespaces.
for (const auto &I : SemaRef.KnownNamespaces)
@@ -5834,19 +5839,19 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
Stream.EmitRecord(PENDING_IMPLICIT_INSTANTIATIONS, PendingInstantiations);
}
+ auto AddEmittedDeclRefOrZero = [this](RecordData &Refs, Decl *D) {
+ if (!D || !wasDeclEmitted(D))
+ Refs.push_back(0);
+ else
+ AddDeclRef(D, Refs);
+ };
+
// Write the record containing declaration references of Sema.
RecordData SemaDeclRefs;
if (SemaRef.StdNamespace || SemaRef.StdBadAlloc || SemaRef.StdAlignValT) {
- auto AddEmittedDeclRefOrZero = [this, &SemaDeclRefs](Decl *D) {
- if (!D || !wasDeclEmitted(D))
- SemaDeclRefs.push_back(0);
- else
- AddDeclRef(D, SemaDeclRefs);
- };
-
- AddEmittedDeclRefOrZero(SemaRef.getStdNamespace());
- AddEmittedDeclRefOrZero(SemaRef.getStdBadAlloc());
- AddEmittedDeclRefOrZero(SemaRef.getStdAlignValT());
+ AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdNamespace());
+ AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdBadAlloc());
+ AddEmittedDeclRefOrZero(SemaDeclRefs, SemaRef.getStdAlignValT());
}
if (!SemaDeclRefs.empty())
Stream.EmitRecord(SEMA_DECL_REFS, SemaDeclRefs);
@@ -5862,9 +5867,13 @@ void ASTWriter::WriteSpecialDeclRecords(Sema &SemaRef) {
// Write the record containing CUDA-specific declaration references.
RecordData CUDASpecialDeclRefs;
- if (auto *CudaCallDecl = Context.getcudaConfigureCallDecl();
- CudaCallDecl && wasDeclEmitted(CudaCallDecl)) {
- AddDeclRef(CudaCallDecl, CUDASpecialDeclRefs);
+ if (auto *CudaCallDecl = Context.getcudaConfigureCallDecl(),
+ *CudaGetParamDecl = Context.getcudaGetParameterBufferDecl(),
+ *CudaLaunchDecl = Context.getcudaLaunchDeviceDecl();
+ CudaCallDecl || CudaGetParamDecl || CudaLaunchDecl) {
+ AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaCallDecl);
+ AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaGetParamDecl);
+ AddEmittedDeclRefOrZero(CUDASpecialDeclRefs, CudaLaunchDecl);
Stream.EmitRecord(CUDA_SPECIAL_DECL_REFS, CUDASpecialDeclRefs);
}
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index e7ad784335027..421fa4dd7dbae 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -72,7 +72,13 @@ extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
extern "C" cudaError_t cudaLaunchKernel_ptsz(const void *func, dim3 gridDim,
dim3 blockDim, void **args,
size_t sharedMem, cudaStream_t stream);
-
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+ void *parameterBuffer,
+ dim3 gridDim, dim3 blockDim,
+ unsigned int sharedMem,
+ cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+ size_t size);
#endif
extern "C" __device__ int printf(const char*, ...);
diff --git a/clang/test/CodeGenCUDA/device-kernel-call.cu b/clang/test/CodeGenCUDA/device-kernel-call.cu
new file mode 100644
index 0000000000000..eff2b37bd298d
--- /dev/null
+++ b/clang/test/CodeGenCUDA/device-kernel-call.cu
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fgpu-rdc -emit-llvm %s -o - | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// CHECK-LABEL: define dso_local ptx_kernel void @_Z2g2i(
+// CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR]], align 4
+// CHECK-NEXT: ret void
+//
+__global__ void g2(int x) {}
+
+// CHECK-LABEL: define dso_local ptx_kernel void @_Z2g1v(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
+// CHECK-NEXT: [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
+// CHECK-NEXT: [[CALL:%.*]] = call ptr @cudaGetParameterBuffer(i64 noundef 64, i64 noundef 4) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[TMP0:%.*]] = icmp ne ptr [[CALL]], null
+// CHECK-NEXT: br i1 [[TMP0]], label %[[DKCALL_CONFIGOK:.*]], label %[[DKCALL_END:.*]]
+// CHECK: [[DKCALL_CONFIGOK]]:
+// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i64 0
+// CHECK-NEXT: store i32 42, ptr [[TMP1]], align 64
+// CHECK-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 1, i32 noundef 1, i32 noundef 1) #[[ATTR3]]
+// CHECK-NEXT: call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 1, i32 noundef 1, i32 noundef 1) #[[ATTR3]]
+// CHECK-NEXT: [[CALL2:%.*]] = call i32 @cudaLaunchDevice(ptr noundef @_Z2g2i, ptr noundef [[CALL]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null) #[[ATTR3]]
+// CHECK-NEXT: br label %[[DKCALL_END]]
+// CHECK: [[DKCALL_END]]:
+// CHECK-NEXT: ret void
+//
+__global__ void g1(void) {
+ g2<<<1, 1>>>(42);
+}
diff --git a/clang/test/Driver/nvlink-wrapper.c b/clang/test/Driver/nvlink-wrapper.c
index 79f4a6641732f..387d2186389e5 100644
--- a/clang/test/Driver/nvlink-wrapper.c
+++ b/clang/test/Driver/nvlink-wrapper.c
@@ -51,14 +51,17 @@ int baz() { return y + x; }
// `libx.a` and `liby.a` because extern weak symbols do not extract and `libz.a`
// is not used at all.
//
-// RUN: clang-nvlink-wrapper --dry-run %t-x.a %t-u.a %t-y.a %t-z.a %t-w.a %t.o \
+// RUN: clang-nvlink-wrapper --dry-run --assume-device-archive %t-x.a %t-u.a %t-y.a %t-z.a %t-w.a %t.o \
// RUN: -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LINK
+// RUN: clang-nvlink-wrapper --dry-run %t-x.a %t-u.a %t-y.a %t-z.a %t-w.a %t.o \
+// RUN: -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=FORWARD
// LINK: nvlink{{.*}} -arch sm_52 -o a.out [[INPUT:.+]].cubin {{.*}}-x-{{.*}}.cubin{{.*}}-y-{{.*}}.cubin
+// FORWARD: nvlink{{.*}} -arch sm_52 -o a.out [[INPUT:.+]].cubin {{.*}}-x.a {{.*}}-u.a {{.*}}-y.a {{.*}}-z.a {{.*}}-w.a
//
// Same as above but we use '--undefined' to forcibly extract 'libz.a'
//
-// RUN: clang-nvlink-wrapper --dry-run %t-x.a %t-u.a %t-y.a %t-z.a %t-w.a %t.o \
+// RUN: clang-nvlink-wrapper --dry-run --assume-device-archive %t-x.a %t-u.a %t-y.a %t-z.a %t-w.a %t.o \
// RUN: -u z -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LINK
// UNDEFINED: nvlink{{.*}} -arch sm_52 -o a.out [[INPUT:.+]].cubin {{.*}}-x-{{.*}}.cubin{{.*}}-y-{{.*}}.cubin{{.*}}-z-{{.*}}.cubin
@@ -66,7 +69,7 @@ int baz() { return y + x; }
// Check that the LTO interface works and properly preserves symbols used in a
// regular object file.
//
-// RUN: clang-nvlink-wrapper --dry-run %t.o %t-u.o %t-y.a \
+// RUN: clang-nvlink-wrapper --dry-run --assume-device-archive %t.o %t-u.o %t-y.a \
// RUN: -arch sm_52 -o a.out 2>&1 | FileCheck %s --check-prefix=LTO
// LTO: ptxas{{.*}} -m64 -c [[PTX:.+]].s -O3 -arch sm_52 -o [[CUBIN:.+]].cubin
// LTO: nvlink{{.*}} -arch sm_52 -o a.out [[CUBIN]].cubin {{.*}}-u-{{.*}}.cubin {{.*}}-y-{{.*}}.cubin
diff --git a/clang/test/SemaCUDA/Inputs/cuda.h b/clang/test/SemaCUDA/Inputs/cuda.h
index 2bf45e03d91c7..de6f7fb635421 100644
--- a/clang/test/SemaCUDA/Inputs/cuda.h
+++ b/clang/test/SemaCUDA/Inputs/cuda.h
@@ -46,6 +46,13 @@ extern "C" int __cudaPushCallConfiguration(dim3 gridSize, dim3 blockSize,
extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim,
dim3 blockDim, void **args,
size_t sharedMem, cudaStream_t stream);
+extern "C" __device__ cudaError_t cudaLaunchDevice(void *func,
+ void *parameterBuffer,
+ dim3 gridDim, dim3 blockDim,
+ unsigned int sharedMem,
+ cudaStream_t stream);
+extern "C" __device__ void *cudaGetParameterBuffer(size_t alignment,
+ size_t size);
#endif
// Host- and device-side placement new overloads.
diff --git a/clang/test/SemaCUDA/call-kernel-from-kernel.cu b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
index 5f8832f3cd070..01dba44339520 100644
--- a/clang/test/SemaCUDA/call-kernel-from-kernel.cu
+++ b/clang/test/SemaCUDA/call-kernel-from-kernel.cu
@@ -1,9 +1,12 @@
// RUN: %clang_cc1 %s --std=c++11 -triple nvptx -o - \
// RUN: -verify -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
+// RUN: %clang_cc1 %s --std=c++11 -fgpu-rdc -triple nvptx -o - \
+// RUN: -verify=rdc -fcuda-is-device -fsyntax-only -verify-ignore-unexpected=note
+// rdc-no-diagnostics
#include "Inputs/cuda.h"
__global__ void kernel1();
__global__ void kernel2() {
- kernel1<<<1,1>>>(); // expected-error {{reference to __global__ function 'kernel1' in __global__ function}}
+ kernel1<<<1,1>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
}
diff --git a/clang/test/SemaCUDA/device-kernel-call.cu b/clang/test/SemaCUDA/device-kernel-call.cu
new file mode 100644
index 0000000000000..fea6deac02e55
--- /dev/null
+++ b/clang/test/SemaCUDA/device-kernel-call.cu
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -fcuda-is-device -verify=nordc %s
+// RUN: %clang_cc1 -fcuda-is-device -fgpu-rdc -verify=rdc %s
+// RUN: %clang_cc1 -x hip -fcuda-is-device -verify=hip %s
+
+// rdc-no-diagnostics
+
+#include "Inputs/cuda.h"
+
+__global__ void g2(int x) {}
+
+// CHECK-LABEL: define{{.*}}g1
+__global__ void g1(void) {
+ // CHECK: [[CONFIG:%.*]] = call{{.*}}_Z22cudaGetParameterBuffermm(i64{{.*}}64, i64{{.*}}4)
+ // CHECK-NEXT: [[FLAG:%.*]] = icmp ne ptr [[CONFIG]], null
+ // CHECK-NEXT: br i1 [[FLAG]], label %[[THEN:.*]], label %[[ENDIF:.*]]
+ // CHECK: [[THEN]]:
+ // CHECK-NEXT: [[PPTR:%.*]] = getelementptr{{.*}}i8, ptr [[CONFIG]], i64 0
+ // CHECK-NEXT: store i32 42, ptr [[PPTR]]
+ // CHECK: = call{{.*}} i32 @_Z16cudaLaunchDevicePvS_4dim3S0_jP10cudaStream(ptr{{.*}} @_Z2g2i, ptr{{.*}} [[CONFIG]],
+ g2<<<1, 1>>>(42);
+ // nordc-error at -1 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
+ // hip-error at -2 {{device-side kernel call/launch is not supported}}
+}
diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
index 3d05839af7528..11f84a912ea7b 100644
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -91,10 +91,7 @@ __host__ HostReturnTy h() { return HostReturnTy(); }
// devdefer-note at -4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
__global__ void g() {}
-// dev-note at -1 1+ {{'g' declared here}}
-// devdefer-note at -2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
// expected-note at -3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-// devdefer-note at -4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
// host-note at -1 1+ {{'cd' declared here}}
@@ -144,9 +141,9 @@ __device__ void devicef() {
DeviceFnPtr fp_cdh = cdh;
DeviceReturnTy ret_cdh = cdh();
- GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __device__ function}}
- g(); // devdefer-error {{no matching function for call to 'g'}}
- g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __device__ function}}
+ GlobalFnPtr fp_g = g;
+ g(); // expected-error {{call to global function 'g' not configured}}
+ g<<<0,0>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
}
__global__ void globalf() {
@@ -165,9 +162,9 @@ __global__ void globalf() {
DeviceFnPtr fp_cdh = cdh;
DeviceReturnTy ret_cdh = cdh();
- GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __global__ function}}
- g(); // devdefer-error {{no matching function for call to 'g'}}
- g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __global__ function}}
+ GlobalFnPtr fp_g = g;
+ g(); // expected-error {{call to global function 'g' not configured}}
+ g<<<0,0>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
}
__host__ __device__ void hostdevicef() {
@@ -199,20 +196,13 @@ __host__ __device__ void hostdevicef() {
CurrentReturnTy ret_cdh = cdh();
GlobalFnPtr fp_g = g;
-#if defined(__CUDA_ARCH__)
- // expected-error at -2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-#endif
g();
-#if defined (__CUDA_ARCH__)
- // expected-error at -2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-#else
- // expected-error at -4 {{call to global function 'g' not configured}}
-#endif
+ // expected-error at -1 {{call to global function 'g' not configured}}
g<<<0,0>>>();
#if defined(__CUDA_ARCH__)
- // expected-error at -2 {{reference to __global__ function 'g' in __host__ __device__ function}}
+ // expected-error at -2 {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
#endif
}
diff --git a/clang/test/SemaCUDA/function-target.cu b/clang/test/SemaCUDA/function-target.cu
index 64444b6676248..66704a320cee1 100644
--- a/clang/test/SemaCUDA/function-target.cu
+++ b/clang/test/SemaCUDA/function-target.cu
@@ -24,11 +24,11 @@ __host__ void h1(void) {
__host__ void d1h(void); // expected-note {{candidate function not viable: call to __host__ function from __device__ function}}
__device__ void d1d(void);
__host__ __device__ void d1hd(void);
-__global__ void d1g(void); // dev-note {{'d1g' declared here}}
+__global__ void d1g(void);
__device__ void d1(void) {
d1h(); // expected-error {{no matching function}}
d1d();
d1hd();
- d1g<<<1, 1>>>(); // dev-error {{reference to __global__ function 'd1g' in __device__ function}}
+ d1g<<<1, 1>>>(); // expected-error {{kernel launch from __device__ or __global__ function requires relocatable device code (i.e. requires -fgpu-rdc)}}
}
diff --git a/clang/test/SemaCUDA/reference-to-kernel-fn.cu b/clang/test/SemaCUDA/reference-to-kernel-fn.cu
index 70a1cda6ab0c8..bdb70fc8b55d1 100644
--- a/clang/test/SemaCUDA/reference-to-kernel-fn.cu
+++ b/clang/test/SemaCUDA/reference-to-kernel-fn.cu
@@ -8,6 +8,7 @@
// device-side kernel launches.)
// host-no-diagnostics
+// dev-no-diagnostics
#include "Inputs/cuda.h"
@@ -19,11 +20,10 @@ typedef void (*fn_ptr_t)();
__host__ __device__ fn_ptr_t get_ptr_hd() {
return kernel;
- // dev-error at -1 {{reference to __global__ function}}
}
__host__ fn_ptr_t get_ptr_h() {
return kernel;
}
__device__ fn_ptr_t get_ptr_d() {
- return kernel; // dev-error {{reference to __global__ function}}
+ return kernel;
}
diff --git a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
index 58eb671c61989..16ff9c06a239a 100644
--- a/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
+++ b/clang/tools/clang-nvlink-wrapper/ClangNVLinkWrapper.cpp
@@ -45,6 +45,8 @@
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/WithColor.h"
+#include <tuple>
+
using namespace llvm;
using namespace llvm::opt;
using namespace llvm::object;
@@ -165,6 +167,30 @@ void diagnosticHandler(const DiagnosticInfo &DI) {
}
}
+// Check if the archive has any object file with a fat binary.
+bool hasFatBinary(const ArgList &Args, const Archive &LibFile) {
+ if (Args.hasArg(OPT_dry_run) && Args.hasArg(OPT_assume_device_archive))
+ return false;
+ Error Err = Error::success();
+ for (auto &C : LibFile.children(Err)) {
+ auto ChildBufferOrErr = C.getMemoryBufferRef();
+ if (!ChildBufferOrErr)
+ return false;
+ auto ObjFileOrErr = ObjectFile::createObjectFile(*ChildBufferOrErr);
+ if (!ObjFileOrErr)
+ return false;
+ const auto &Obj = **ObjFileOrErr;
+ // If the target of this object is not the device one, this archive most
+ // likely has fat binaries.
+ if (Obj.getArch() != Triple::nvptx && Obj.getArch() != Triple::nvptx64)
+ return true;
+ }
+ // 'Err' must be checked before destruction; treat errors as no fat binary.
+ if (Err)
+ return false;
+ return false;
+}
+
Expected<StringRef> createTempFile(const ArgList &Args, const Twine &Prefix,
StringRef Extension) {
SmallString<128> OutputFile;
@@ -487,8 +513,13 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
for (const opt::Arg *Arg : Args.filtered(OPT_library_path))
LibraryPaths.push_back(Arg->getValue());
+ // Archives (with fatbin) forwarded to nvlink.
+ SmallVector<const char *> ForwardArchives;
+
bool WholeArchive = false;
- SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, bool>> InputFiles;
+ SmallVector<std::tuple<std::unique_ptr<MemoryBuffer>, /*IsLazy=*/bool,
+ /*Forward=*/bool>>
+ InputFiles;
for (const opt::Arg *Arg : Args.filtered(
OPT_INPUT, OPT_library, OPT_whole_archive, OPT_no_whole_archive)) {
if (Arg->getOption().matches(OPT_whole_archive) ||
@@ -518,13 +549,21 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
switch (identify_magic(Buffer.getBuffer())) {
case file_magic::bitcode:
case file_magic::elf_relocatable:
- InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false);
+ InputFiles.emplace_back(std::move(*BufferOrErr), /*IsLazy=*/false,
+ /*Forward=*/false);
break;
case file_magic::archive: {
Expected<std::unique_ptr<object::Archive>> LibFile =
object::Archive::create(Buffer);
if (!LibFile)
return LibFile.takeError();
+ // Skip extracting archives with fat binaries. Forward them to nvlink.
+ if (hasFatBinary(Args, **LibFile)) {
+ InputFiles.emplace_back(std::unique_ptr<MemoryBuffer>{},
+ /*IsLazy=*/false, /*Forward=*/true);
+ ForwardArchives.emplace_back(Args.MakeArgString(*Filename));
+ break;
+ }
Error Err = Error::success();
for (auto Child : (*LibFile)->children(Err)) {
auto ChildBufferOrErr = Child.getMemoryBufferRef();
@@ -534,7 +573,8 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
MemoryBuffer::getMemBufferCopy(
ChildBufferOrErr->getBuffer(),
ChildBufferOrErr->getBufferIdentifier());
- InputFiles.emplace_back(std::move(ChildBuffer), !WholeArchive);
+ InputFiles.emplace_back(std::move(ChildBuffer), !WholeArchive,
+ /*Forward=*/false);
}
if (Err)
return Err;
@@ -549,10 +589,19 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
StringMap<Symbol> SymTab;
for (auto &Sym : Args.getAllArgValues(OPT_u))
SymTab[Sym] = Symbol(Symbol::Undefined);
- SmallVector<std::unique_ptr<MemoryBuffer>> LinkerInput;
+ SmallVector<std::pair<std::unique_ptr<MemoryBuffer>, /*Forward=*/bool>>
+ LinkerInput;
while (Extracted) {
Extracted = false;
- for (auto &[Input, IsLazy] : InputFiles) {
+ for (auto &[Input, IsLazy, Forward] : InputFiles) {
+ // Forward input if required.
+ if (Forward) {
+ LinkerInput.emplace_back(std::unique_ptr<MemoryBuffer>{},
+ /*Forward=*/true);
+ Forward = false; // Mark this input as processed.
+ continue;
+ }
+
if (!Input)
continue;
@@ -566,17 +615,20 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
if (!*ExtractOrErr)
continue;
- LinkerInput.emplace_back(std::move(Input));
+ LinkerInput.emplace_back(std::move(Input), /*Forward=*/false);
}
}
InputFiles.clear();
// Extract any bitcode files to be passed to the LTO pipeline.
SmallVector<std::unique_ptr<MemoryBuffer>> BitcodeFiles;
- for (auto &Input : LinkerInput)
+ for (auto &[Input, Forward] : LinkerInput) {
+ if (Forward)
+ continue;
if (identify_magic(Input->getBuffer()) == file_magic::bitcode)
BitcodeFiles.emplace_back(std::move(Input));
- erase_if(LinkerInput, [](const auto &F) { return !F; });
+ }
+ erase_if(LinkerInput, [](const auto &P) { return !P.second && !P.first; });
// Run the LTO pipeline on the extracted inputs.
SmallVector<StringRef> Files;
@@ -672,7 +724,13 @@ Expected<SmallVector<StringRef>> getInput(const ArgList &Args) {
// linker requires all NVPTX inputs to have this extension for some reason.
// We don't use a symbolic link because it's not supported on Windows and some
// of this input files could be extracted from an archive.
- for (auto &Input : LinkerInput) {
+ auto FAI = ForwardArchives.begin();
+ for (auto &[Input, Forward] : LinkerInput) {
+ if (Forward) {
+ assert(FAI != ForwardArchives.end());
+ Files.emplace_back(*FAI++);
+ continue;
+ }
auto TempFileOrErr = createTempFile(
Args, sys::path::stem(Input->getBufferIdentifier()), "cubin");
if (!TempFileOrErr)
diff --git a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
index 7af35bf5989ec..3a77d84548249 100644
--- a/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
+++ b/clang/tools/clang-nvlink-wrapper/NVLinkOpts.td
@@ -112,3 +112,8 @@ def mllvm_EQ : Joined<["-"], "mllvm=">, Flags<[HelpHidden]>, Alias<mllvm>;
def dry_run : Flag<["--", "-"], "dry-run">, Flags<[WrapperOnlyOption]>,
HelpText<"Print generated commands without running.">;
+def assume_device_archive
+ : Flag<["--", "-"], "assume-device-archive">,
+ Flags<[WrapperOnlyOption]>,
+ HelpText<
+ "Assume archives contain only device object files (dry-run mode only).">;