r262498 - [CUDA] Emit host-side 'shadows' for device-side global variables
Artem Belevich via cfe-commits
cfe-commits at lists.llvm.org
Wed Mar 2 10:28:51 PST 2016
Author: tra
Date: Wed Mar 2 12:28:50 2016
New Revision: 262498
URL: http://llvm.org/viewvc/llvm-project?rev=262498&view=rev
Log:
[CUDA] Emit host-side 'shadows' for device-side global variables
... and register them with CUDA runtime.
This is needed for commonly used cudaMemcpy*() APIs that use address of
host-side shadow to access their counterparts on device side.
Fixes PR26340
Differential Revision: http://reviews.llvm.org/D17779
Modified:
cfe/trunk/lib/CodeGen/CGCUDANV.cpp
cfe/trunk/lib/CodeGen/CGCUDARuntime.h
cfe/trunk/lib/CodeGen/CodeGenModule.cpp
cfe/trunk/test/CodeGenCUDA/device-stub.cu
cfe/trunk/test/CodeGenCUDA/filter-decl.cu
Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=262498&r1=262497&r2=262498&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Wed Mar 2 12:28:50 2016
@@ -38,6 +38,7 @@ private:
llvm::Module &TheModule;
/// Keeps track of kernel launch stubs emitted in this module
llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+ llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
/// Keeps track of variables containing handles of GPU binaries. Populated by
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
/// ModuleDtorFunction()
@@ -47,7 +48,7 @@ private:
llvm::Constant *getLaunchFn() const;
/// Creates a function to register all kernel stubs generated in this module.
- llvm::Function *makeRegisterKernelsFn();
+ llvm::Function *makeRegisterGlobalsFn();
/// Helper function that generates a constant string and returns a pointer to
/// the start of the string. The result of this function can be used anywhere
@@ -68,6 +69,10 @@ public:
CGNVCUDARuntime(CodeGenModule &CGM);
void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
+ void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override {
+ DeviceVars.push_back(std::make_pair(&Var, Flags));
+ }
+
/// Creates module constructor function
llvm::Function *makeModuleCtorFunction() override;
/// Creates module destructor function
@@ -158,19 +163,24 @@ void CGNVCUDARuntime::emitDeviceStubBody
CGF.EmitBlock(EndBlock);
}
-/// Creates internal function to register all kernel stubs generated in this
-/// module with the CUDA runtime.
+/// Creates a function that sets up state on the host side for CUDA objects that
+/// have a presence on both the host and device sides. Specifically, registers
+/// the host side of kernel functions and device global variables with the CUDA
+/// runtime.
/// \code
-/// void __cuda_register_kernels(void** GpuBinaryHandle) {
+/// void __cuda_register_globals(void** GpuBinaryHandle) {
/// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
/// ...
/// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
+/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
+/// ...
+/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
-llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
- llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
+ llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
llvm::BasicBlock *EntryBB =
llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
CGBuilderTy Builder(CGM, Context);
@@ -186,18 +196,44 @@ llvm::Function *CGNVCUDARuntime::makeReg
"__cudaRegisterFunction");
// Extract GpuBinaryHandle passed as the first argument passed to
- // __cuda_register_kernels() and generate __cudaRegisterFunction() call for
+ // __cuda_register_globals() and generate __cudaRegisterFunction() call for
// each emitted kernel.
llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
for (llvm::Function *Kernel : EmittedKernels) {
llvm::Constant *KernelName = makeConstantString(Kernel->getName());
llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
- llvm::Value *args[] = {
+ llvm::Value *Args[] = {
&GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
NullPtr, NullPtr, NullPtr,
llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
- Builder.CreateCall(RegisterFunc, args);
+ Builder.CreateCall(RegisterFunc, Args);
+ }
+
+ // void __cudaRegisterVar(void **, char *, char *, const char *,
+ // int, int, int, int)
+ std::vector<llvm::Type *> RegisterVarParams = {
+ VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy,
+ IntTy, IntTy, IntTy, IntTy};
+ llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
+ llvm::FunctionType::get(IntTy, RegisterVarParams, false),
+ "__cudaRegisterVar");
+ for (auto &Pair : DeviceVars) {
+ llvm::GlobalVariable *Var = Pair.first;
+ unsigned Flags = Pair.second;
+ llvm::Constant *VarName = makeConstantString(Var->getName());
+ uint64_t VarSize =
+ CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
+ llvm::Value *Args[] = {
+ &GpuBinaryHandlePtr,
+ Builder.CreateBitCast(Var, VoidPtrTy),
+ VarName,
+ VarName,
+ llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0),
+ llvm::ConstantInt::get(IntTy, VarSize),
+ llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0),
+ llvm::ConstantInt::get(IntTy, 0)};
+ Builder.CreateCall(RegisterVar, Args);
}
Builder.CreateRetVoid();
@@ -208,15 +244,15 @@ llvm::Function *CGNVCUDARuntime::makeReg
/// \code
/// void __cuda_module_ctor(void*) {
/// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-/// __cuda_register_kernels(Handle0);
+/// __cuda_register_globals(Handle0);
/// ...
/// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-/// __cuda_register_kernels(HandleN);
+/// __cuda_register_globals(HandleN);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
- // void __cuda_register_kernels(void* handle);
- llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+ // void __cuda_register_globals(void* handle);
+ llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// void ** __cudaRegisterFatBinary(void *);
llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@@ -272,8 +308,8 @@ llvm::Function *CGNVCUDARuntime::makeMod
CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
CGM.getPointerAlign());
- // Call __cuda_register_kernels(GpuBinaryHandle);
- CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+ // Call __cuda_register_globals(GpuBinaryHandle);
+ CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
// Save GpuBinaryHandle so we can unregister it in destructor.
GpuBinaryHandles.push_back(GpuBinaryHandle);
Modified: cfe/trunk/lib/CodeGen/CGCUDARuntime.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDARuntime.h?rev=262498&r1=262497&r2=262498&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGCUDARuntime.h (original)
+++ cfe/trunk/lib/CodeGen/CGCUDARuntime.h Wed Mar 2 12:28:50 2016
@@ -18,6 +18,7 @@
namespace llvm {
class Function;
+class GlobalVariable;
}
namespace clang {
@@ -37,6 +38,12 @@ protected:
CodeGenModule &CGM;
public:
+ // Global variable properties that must be passed to CUDA runtime.
+ enum DeviceVarFlags {
+ ExternDeviceVar = 0x01, // extern
+ ConstantDeviceVar = 0x02, // __constant__
+ };
+
CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {}
virtual ~CGCUDARuntime();
@@ -46,6 +53,7 @@ public:
/// Emits a kernel launch stub.
virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
+ virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0;
/// Constructs and returns a module initialization function or nullptr if it's
/// not needed. Must be called after all kernels have been emitted.
Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=262498&r1=262497&r2=262498&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original)
+++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Wed Mar 2 12:28:50 2016
@@ -1528,11 +1528,18 @@ void CodeGenModule::EmitGlobal(GlobalDec
!Global->hasAttr<CUDASharedAttr>())
return;
} else {
- if (!Global->hasAttr<CUDAHostAttr>() && (
- Global->hasAttr<CUDADeviceAttr>() ||
- Global->hasAttr<CUDAConstantAttr>() ||
- Global->hasAttr<CUDASharedAttr>()))
+ // We need to emit host-side 'shadows' for all global
+ // device-side variables because the CUDA runtime needs their
+ // size and host-side address in order to provide access to
+ // their device-side incarnations.
+
+ // So device-only functions are the only things we skip.
+ if (isa<FunctionDecl>(Global) && !Global->hasAttr<CUDAHostAttr>() &&
+ Global->hasAttr<CUDADeviceAttr>())
return;
+
+ assert((isa<FunctionDecl>(Global) || isa<VarDecl>(Global)) &&
+ "Expected Variable or Function");
}
}
@@ -1561,8 +1568,15 @@ void CodeGenModule::EmitGlobal(GlobalDec
} else {
const auto *VD = cast<VarDecl>(Global);
assert(VD->isFileVarDecl() && "Cannot emit local var decl as global.");
-
- if (VD->isThisDeclarationADefinition() != VarDecl::Definition &&
+ // We need to emit device-side global CUDA variables even if a
+ // variable does not have a definition -- we still need to define
+ // host-side shadow for it.
+ bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice &&
+ !VD->hasDefinition() &&
+ (VD->hasAttr<CUDAConstantAttr>() ||
+ VD->hasAttr<CUDADeviceAttr>());
+ if (!MustEmitForCuda &&
+ VD->isThisDeclarationADefinition() != VarDecl::Definition &&
!Context.isMSStaticDataMemberInlineDefinition(VD))
return;
}
@@ -2444,6 +2458,10 @@ void CodeGenModule::EmitGlobalVarDefinit
if (D->hasAttr<AnnotateAttr>())
AddGlobalAnnotations(D, GV);
+ // Set the llvm linkage type as appropriate.
+ llvm::GlobalValue::LinkageTypes Linkage =
+ getLLVMLinkageVarDefinition(D, GV->isConstant());
+
// CUDA B.2.1 "The __device__ qualifier declares a variable that resides on
// the device. [...]"
// CUDA B.2.2 "The __constant__ qualifier, optionally used together with
@@ -2451,9 +2469,34 @@ void CodeGenModule::EmitGlobalVarDefinit
// Is accessible from all the threads within the grid and from the host
// through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize()
// / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())."
- if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice &&
- (D->hasAttr<CUDAConstantAttr>() || D->hasAttr<CUDADeviceAttr>())) {
- GV->setExternallyInitialized(true);
+ if (GV && LangOpts.CUDA) {
+ if (LangOpts.CUDAIsDevice) {
+ if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>())
+ GV->setExternallyInitialized(true);
+ } else {
+ // Host-side shadows of external declarations of device-side
+ // global variables become internal definitions. These have to
+ // be internal in order to prevent name conflicts with global
+ // host variables with the same name in a different TUs.
+ if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
+ Linkage = llvm::GlobalValue::InternalLinkage;
+
+ // Shadow variables and their properties must be registered
+ // with CUDA runtime.
+ unsigned Flags = 0;
+ if (!D->hasDefinition())
+ Flags |= CGCUDARuntime::ExternDeviceVar;
+ if (D->hasAttr<CUDAConstantAttr>())
+ Flags |= CGCUDARuntime::ConstantDeviceVar;
+ getCUDARuntime().registerDeviceVar(*GV, Flags);
+ } else if (D->hasAttr<CUDASharedAttr>())
+ // __shared__ variables are odd. Shadows do get created, but
+ // they are not registered with the CUDA runtime, so they
+ // can't really be used to access their device-side
+ // counterparts. It's not clear yet whether it's nvcc's bug or
+ // a feature, but we've got to do the same for compatibility.
+ Linkage = llvm::GlobalValue::InternalLinkage;
+ }
}
GV->setInitializer(Init);
@@ -2470,9 +2513,6 @@ void CodeGenModule::EmitGlobalVarDefinit
GV->setAlignment(getContext().getDeclAlign(D).getQuantity());
- // Set the llvm linkage type as appropriate.
- llvm::GlobalValue::LinkageTypes Linkage =
- getLLVMLinkageVarDefinition(D, GV->isConstant());
// On Darwin, if the normal linkage of a C++ thread_local variable is
// LinkOnce or Weak, we keep the normal linkage to prevent multiple
Modified: cfe/trunk/test/CodeGenCUDA/device-stub.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/device-stub.cu?rev=262498&r1=262497&r2=262498&view=diff
==============================================================================
--- cfe/trunk/test/CodeGenCUDA/device-stub.cu (original)
+++ cfe/trunk/test/CodeGenCUDA/device-stub.cu Wed Mar 2 12:28:50 2016
@@ -2,6 +2,40 @@
#include "Inputs/cuda.h"
+// CHECK-DAG: @device_var = internal global i32
+__device__ int device_var;
+
+// CHECK-DAG: @constant_var = internal global i32
+__constant__ int constant_var;
+
+// CHECK-DAG: @shared_var = internal global i32
+__shared__ int shared_var;
+
+// Make sure host globals don't get internalized...
+// CHECK-DAG: @host_var = global i32
+int host_var;
+// ... and that extern vars remain external.
+// CHECK-DAG: @ext_host_var = external global i32
+extern int ext_host_var;
+
+// Shadows for external device-side variables are *definitions* of
+// those variables.
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __device__ int ext_device_var;
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __constant__ int ext_constant_var;
+
+void use_pointers() {
+ int *p;
+ p = &device_var;
+ p = &constant_var;
+ p = &shared_var;
+ p = &host_var;
+ p = &ext_device_var;
+ p = &ext_constant_var;
+ p = &ext_host_var;
+}
+
// Make sure that all parts of GPU code init/cleanup are there:
// * constant unnamed string with the kernel name
// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
@@ -32,9 +66,14 @@ __global__ void kernelfunc(int i, int j,
// CHECK: call{{.*}}kernelfunc
void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
-// Test that we've built a function to register kernels
-// CHECK: define internal void @__cuda_register_kernels
+// Test that we've built a function to register kernels and global vars.
+// CHECK: define internal void @__cuda_register_globals
// CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
+// CHECK: ret void
// Test that we've built contructor..
// CHECK: define internal void @__cuda_module_ctor
@@ -42,8 +81,8 @@ void hostfunc(void) { kernelfunc<<<1, 1>
// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
// .. stores return value in __cuda_gpubin_handle
// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
-// .. and then calls __cuda_register_kernels
-// CHECK-NEXT: call void @__cuda_register_kernels
+// .. and then calls __cuda_register_globals
+// CHECK-NEXT: call void @__cuda_register_globals
// Test that we've created destructor.
// CHECK: define internal void @__cuda_module_dtor
Modified: cfe/trunk/test/CodeGenCUDA/filter-decl.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/filter-decl.cu?rev=262498&r1=262497&r2=262498&view=diff
==============================================================================
--- cfe/trunk/test/CodeGenCUDA/filter-decl.cu (original)
+++ cfe/trunk/test/CodeGenCUDA/filter-decl.cu Wed Mar 2 12:28:50 2016
@@ -9,15 +9,15 @@
// CHECK-DEVICE-NOT: module asm "file scope asm is host only"
__asm__("file scope asm is host only");
-// CHECK-HOST-NOT: constantdata = externally_initialized global
+// CHECK-HOST: constantdata = internal global
// CHECK-DEVICE: constantdata = externally_initialized global
__constant__ char constantdata[256];
-// CHECK-HOST-NOT: devicedata = externally_initialized global
+// CHECK-HOST: devicedata = internal global
// CHECK-DEVICE: devicedata = externally_initialized global
__device__ char devicedata[256];
-// CHECK-HOST-NOT: shareddata = global
+// CHECK-HOST: shareddata = internal global
// CHECK-DEVICE: shareddata = global
__shared__ char shareddata[256];
More information about the cfe-commits
mailing list