[clang] 80072fd - [CUDA][HIP] Allow comdat for kernels

Wed Nov 10 13:43:09 PST 2021

Author: Yaxun (Sam) Liu
Date: 2021-11-10T16:42:23-05:00
New Revision: 80072fde61d40a4e8a9da673476730d34a483fa2

URL: https://github.com/llvm/llvm-project/commit/80072fde61d40a4e8a9da673476730d34a483fa2
DIFF: https://github.com/llvm/llvm-project/commit/80072fde61d40a4e8a9da673476730d34a483fa2.diff

LOG: [CUDA][HIP] Allow comdat for kernels

Two identical instantiations of a template function can be emitted by two TU's
with linkonce_odr linkage without causing duplicate symbols in linker. MSVC
also requires these symbols be in comdat sections. Linux does not require
the symbols in comdat sections to be merged by linker but by default
clang puts them in comdat sections.

If a template kernel is instantiated identically in two TU's. MSVC requires
that them to be in comdat sections, otherwise MSVC linker will diagnose them as
duplicate symbols. However, currently clang does not put instantiated template
kernels in comdat sections, which causes link error for MSVC.

This patch allows putting instantiated template kernels into comdat sections.

Reviewed by: Artem Belevich, Reid Kleckner

Differential Revision: https://reviews.llvm.org/D112492

Added: 
    

Modified: 
    clang/lib/CodeGen/CGCUDANV.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/test/CodeGenCUDA/kernel-stub-name.cu
    clang/test/CodeGenCUDA/usual-deallocators.cu

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 69499672bd861..a1b4431ca8c43 100644

--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -1147,6 +1147,7 @@ llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
   Var->setAlignment(CGM.getPointerAlign().getAsAlign());
   Var->setDSOLocal(F->isDSOLocal());
   Var->setVisibility(F->getVisibility());
+  CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
   KernelHandles[F] = Var;
   KernelStubs[Var] = F;
   return Var;

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d36cff82f9dde..a4c60f0c50c2d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4308,11 +4308,6 @@ static bool shouldBeInCOMDAT(CodeGenModule &CGM, const Decl &D) {
   if (!CGM.supportsCOMDAT())
     return false;
 
-  // Do not set COMDAT attribute for CUDA/HIP stub functions to prevent
-  // them being "merged" by the COMDAT Folding linker optimization.
-  if (D.hasAttr<CUDAGlobalAttr>())
-    return false;
-
   if (D.hasAttr<SelectAnyAttr>())
     return true;
 

diff  --git a/clang/test/CodeGenCUDA/kernel-stub-name.cu b/clang/test/CodeGenCUDA/kernel-stub-name.cu
index 460dd6010e835..8e82c3612e323 100644
--- a/clang/test/CodeGenCUDA/kernel-stub-name.cu
+++ b/clang/test/CodeGenCUDA/kernel-stub-name.cu
@@ -2,16 +2,35 @@
 
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - -x hip\
-// RUN:   | FileCheck %s
+// RUN:   | FileCheck -check-prefixes=CHECK,GNU %s
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -fcuda-include-gpubinary %t -o - -x hip\
+// RUN:   | FileCheck -check-prefix=NEG %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
+// RUN:     -aux-triple amdgcn-amd-amdhsa -fcuda-include-gpubinary \
+// RUN:     %t -o - -x hip\
+// RUN:   | FileCheck -check-prefixes=CHECK,MSVC %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -emit-llvm %s \
+// RUN:     -aux-triple amdgcn-amd-amdhsa -fcuda-include-gpubinary \
+// RUN:     %t -o - -x hip\
+// RUN:   | FileCheck -check-prefix=NEG %s
 
 #include "Inputs/cuda.h"
 
-// Kernel handles
+// Check kernel handles are emitted for non-MSVC target but not for MSVC target.
 
-// CHECK: @[[HCKERN:ckernel]] = constant void ()* @__device_stub__ckernel, align 8
-// CHECK: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @_ZN2ns23__device_stub__nskernelEv, align 8
-// CHECK: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @_Z25__device_stub__kernelfuncIiEvv, align 8
-// CHECK: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
+// GNU: @[[HCKERN:ckernel]] = constant void ()* @[[CSTUB:__device_stub__ckernel]], align 8
+// GNU: @[[HNSKERN:_ZN2ns8nskernelEv]] = constant void ()* @[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]], align 8
+// GNU: @[[HTKERN:_Z10kernelfuncIiEvv]] = linkonce_odr constant void ()* @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]], comdat, align 8
+// GNU: @[[HDKERN:_Z11kernel_declv]] = external constant void ()*, align 8
+
+// MSVC: @[[HCKERN:ckernel]] = dso_local constant void ()* @[[CSTUB:__device_stub__ckernel]], align 8
+// MSVC: @[[HNSKERN:"\?nskernel at ns@@YAXXZ.*"]] = dso_local constant void ()* @[[NSSTUB:"\?nskernel at ns@@YAXXZ"]], align 8
+// MSVC: @[[HTKERN:"\?\?\$kernelfunc at H@@YAXXZ.*"]] = linkonce_odr dso_local constant void ()* @[[TSTUB:"\?\?\$kernelfunc at H@@YAXXZ.*"]], comdat, align 8
+// MSVC: @[[HDKERN:"\?kernel_decl@@YAXXZ.*"]] = external dso_local constant void ()*, align 8
 
 extern "C" __global__ void ckernel() {}
 
@@ -24,10 +43,10 @@ __global__ void kernelfunc() {}
 
 __global__ void kernel_decl();
 
-void (*kernel_ptr)();
-void *void_ptr;
+extern "C" void (*kernel_ptr)();
+extern "C" void *void_ptr;
 
-void launch(void *kern);
+extern "C" void launch(void *kern);
 
 // Device side kernel names
 
@@ -37,21 +56,22 @@ void launch(void *kern);
 
 // Non-template kernel stub functions
 
-// CHECK: define{{.*}}@[[CSTUB:__device_stub__ckernel]]
+// CHECK: define{{.*}}@[[CSTUB]]
 // CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HCKERN]]
-// CHECK: define{{.*}}@[[NSSTUB:_ZN2ns23__device_stub__nskernelEv]]
-// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
 
+// CHECK: define{{.*}}@[[NSSTUB]]
+// CHECK: call{{.*}}@hipLaunchByPtr{{.*}}@[[HNSKERN]]
 
-// Check kernel stub is used for triple chevron
+// Check kernel stub is called for triple chevron.
 
-// CHECK-LABEL: define{{.*}}@_Z4fun1v()
+// CHECK-LABEL: define{{.*}}@fun1()
 // CHECK: call void @[[CSTUB]]()
 // CHECK: call void @[[NSSTUB]]()
-// CHECK: call void @[[TSTUB:_Z25__device_stub__kernelfuncIiEvv]]()
-// CHECK: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
+// CHECK: call void @[[TSTUB]]()
+// GNU: call void @[[DSTUB:_Z26__device_stub__kernel_declv]]()
+// MSVC: call void @[[DSTUB:"\?kernel_decl@@YAXXZ"]]()
 
-void fun1(void) {
+extern "C" void fun1(void) {
   ckernel<<<1, 1>>>();
   ns::nskernel<<<1, 1>>>();
   kernelfunc<int><<<1, 1>>>();
@@ -67,28 +87,28 @@ void fun1(void) {
 
 // CHECK: declare{{.*}}@[[DSTUB]]
 
-// Check kernel handle is used for passing the kernel as a function pointer
+// Check kernel handle is used for passing the kernel as a function pointer.
 
-// CHECK-LABEL: define{{.*}}@_Z4fun2v()
-// CHECK: call void @_Z6launchPv({{.*}}[[HCKERN]]
-// CHECK: call void @_Z6launchPv({{.*}}[[HNSKERN]]
-// CHECK: call void @_Z6launchPv({{.*}}[[HTKERN]]
-// CHECK: call void @_Z6launchPv({{.*}}[[HDKERN]]
-void fun2() {
+// CHECK-LABEL: define{{.*}}@fun2()
+// CHECK: call void @launch({{.*}}[[HCKERN]]
+// CHECK: call void @launch({{.*}}[[HNSKERN]]
+// CHECK: call void @launch({{.*}}[[HTKERN]]
+// CHECK: call void @launch({{.*}}[[HDKERN]]
+extern "C" void fun2() {
   launch((void *)ckernel);
   launch((void *)ns::nskernel);
   launch((void *)kernelfunc<int>);
   launch((void *)kernel_decl);
 }
 
-// Check kernel handle is used for assigning a kernel to a function pointer
+// Check kernel handle is used for assigning a kernel to a function pointer.
 
-// CHECK-LABEL: define{{.*}}@_Z4fun3v()
+// CHECK-LABEL: define{{.*}}@fun3()
 // CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
 // CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr, align 8
 // CHECK:  store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
 // CHECK:  store i8* bitcast (void ()** @[[HCKERN]] to i8*), i8** @void_ptr, align 8
-void fun3() {
+extern "C" void fun3() {
   kernel_ptr = ckernel;
   kernel_ptr = &ckernel;
   void_ptr = (void *)ckernel;
@@ -96,34 +116,37 @@ void fun3() {
 }
 
 // Check kernel stub is loaded from kernel handle when function pointer is
-// used with triple chevron
+// used with triple chevron.
 
-// CHECK-LABEL: define{{.*}}@_Z4fun4v()
+// CHECK-LABEL: define{{.*}}@fun4()
 // CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
-// CHECK:  call i32 @_Z16hipConfigureCall4dim3S_mP9hipStream
+// CHECK:  call i32 @{{.*hipConfigureCall}}
 // CHECK:  %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
 // CHECK:  %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to void ()**
 // CHECK:  %[[STUB:.*]] = load void ()*, void ()** %[[CAST]], align 8
 // CHECK:  call void %[[STUB]]()
-void fun4() {
+extern "C" void fun4() {
   kernel_ptr = ckernel;
   kernel_ptr<<<1,1>>>();
 }
 
-// Check kernel handle is passed to a function
+// Check kernel handle is passed to a function.
 
-// CHECK-LABEL: define{{.*}}@_Z4fun5v()
+// CHECK-LABEL: define{{.*}}@fun5()
 // CHECK:  store void ()* bitcast (void ()** @[[HCKERN]] to void ()*), void ()** @kernel_ptr
 // CHECK:  %[[HANDLE:.*]] = load void ()*, void ()** @kernel_ptr, align 8
 // CHECK:  %[[CAST:.*]] = bitcast void ()* %[[HANDLE]] to i8*
-// CHECK:  call void @_Z6launchPv(i8* %[[CAST]])
-void fun5() {
+// CHECK:  call void @launch(i8* %[[CAST]])
+extern "C" void fun5() {
   kernel_ptr = ckernel;
   launch((void *)kernel_ptr);
 }
 
+// Check kernel handle is registered.
+
 // CHECK-LABEL: define{{.*}}@__hip_register_globals
 // CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HCKERN]]{{.*}}@[[CKERN]]
 // CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HNSKERN]]{{.*}}@[[NSKERN]]
 // CHECK: call{{.*}}@__hipRegisterFunction{{.*}}@[[HTKERN]]{{.*}}@[[TKERN]]
-// CHECK-NOT: call{{.*}}@__hipRegisterFunction{{.*}}@[[HDKERN]]{{.*}}@{{[0-9]*}}
+// NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}__device_stub
+// NEG-NOT: call{{.*}}@__hipRegisterFunction{{.*}}kernel_decl

diff  --git a/clang/test/CodeGenCUDA/usual-deallocators.cu b/clang/test/CodeGenCUDA/usual-deallocators.cu
index 6f4cc267a23f3..da5712386f58b 100644
--- a/clang/test/CodeGenCUDA/usual-deallocators.cu
+++ b/clang/test/CodeGenCUDA/usual-deallocators.cu
@@ -109,7 +109,7 @@ __host__ __device__ void tests_hd(void *t) {
 }
 
 // Make sure that we've generated the kernel used by A::~A.
-// DEVICE-LABEL: define dso_local void @_Z1fIiEvT_
+// DEVICE-LABEL: define void @_Z1fIiEvT_
 
 // Make sure we've picked deallocator for the correct side of compilation.