[clang] [CUDA][HIP] Make template implicitly host device (PR #70369)

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Tue Oct 31 11:19:43 PDT 2023


https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/70369

>From f68b605114ab6b7183b4516df3f1227ef5d6f9d8 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <yaxun.liu at amd.com>
Date: Fri, 27 Oct 2023 23:34:51 -0400
Subject: [PATCH] [CUDA][HIP] Make template implicitly host device

Added option -foffload-implicit-host-device-templates which is off by default.

When the option is on, template functions and specializations without
host/device attributes have implicit host device attributes.

They can be overridden by device template functions with the same signature.
They are emitted on device side only if they are used on device side.

This feature is added as an extension.
`__has_extension(cuda_implicit_host_device_templates)` can be used to
check whether it is enabled.

This is to facilitate using standard C++ headers for device.

Fixes: https://github.com/llvm/llvm-project/issues/69956

Fixes: SWDEV-428314
---
 clang/include/clang/AST/ASTContext.h          |   4 +
 clang/include/clang/Basic/Features.def        |   1 +
 clang/include/clang/Basic/LangOptions.def     |   1 +
 clang/include/clang/Driver/Options.td         |   8 ++
 clang/include/clang/Sema/Sema.h               |   4 +
 clang/lib/CodeGen/CodeGenModule.cpp           |  22 +++-
 clang/lib/Driver/ToolChains/Clang.cpp         |   3 +
 clang/lib/Sema/SemaCUDA.cpp                   |  42 ++++++-
 clang/lib/Sema/SemaExpr.cpp                   |   7 ++
 .../CodeGenCUDA/implicit-host-device-fun.cu   | 118 ++++++++++++++++++
 clang/test/Lexer/has_extension.cu             |  13 ++
 .../test/SemaCUDA/Inputs/forced-host-device.h |   6 +
 .../test/SemaCUDA/implicit-host-device-fun.cu |  22 ++++
 13 files changed, 247 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/CodeGenCUDA/implicit-host-device-fun.cu
 create mode 100644 clang/test/Lexer/has_extension.cu
 create mode 100644 clang/test/SemaCUDA/Inputs/forced-host-device.h
 create mode 100644 clang/test/SemaCUDA/implicit-host-device-fun.cu

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 24d22a9c692cd34..e34ad155c25c3a2 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1156,6 +1156,10 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// host code.
   llvm::DenseSet<const ValueDecl *> CUDAExternalDeviceDeclODRUsedByHost;
 
+  /// Keep track of CUDA/HIP implicit host device functions used on device side
+  /// in device compilation.
+  llvm::DenseSet<const FunctionDecl *> CUDAImplicitHostDeviceFunUsedByDevice;
+
   ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents,
              SelectorTable &sels, Builtin::Context &builtins,
              TranslationUnitKind TUKind);
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index cf626d0120cc7c7..da77aee8de36990 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -283,6 +283,7 @@ FEATURE(cxx_abi_relative_vtable, LangOpts.CPlusPlus && LangOpts.RelativeCXXABIVT
 
 // CUDA/HIP Features
 FEATURE(cuda_noinline_keyword, LangOpts.CUDA)
+EXTENSION(cuda_implicit_host_device_templates, LangOpts.CUDA && LangOpts.OffloadImplicitHostDeviceTemplates)
 
 #undef EXTENSION
 #undef FEATURE
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index c0ea4ecb9806a5b..8f09d714d498c91 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -268,6 +268,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0, "allowing variadic functions in CUDA d
 LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
 LANGOPT(GPUDeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
+LANGOPT(OffloadImplicitHostDeviceTemplates, 1, 0, "assume template functions to be implicitly host device by default for CUDA/HIP")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
 LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c8b730e0f7ecd84..759aee80223876d 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1146,6 +1146,14 @@ defm gpu_rdc : BoolFOption<"gpu-rdc",
           "Generate relocatable device code, also known as separate compilation mode">,
   NegFlag<SetFalse>>;
 
+defm offload_implicit_host_device_templates :
+  BoolFOption<"offload-implicit-host-device-templates",
+  LangOpts<"OffloadImplicitHostDeviceTemplates">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+          "Template functions or specializations without host, device and "
+          "global attributes have implicit host device attributes (CUDA/HIP only)">,
+  NegFlag<SetFalse>>;
+
 def fgpu_default_stream_EQ : Joined<["-"], "fgpu-default-stream=">,
   HelpText<"Specify default stream. The default value is 'legacy'. (CUDA/HIP only)">,
   Visibility<[ClangOption, CC1Option]>,
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index 91a4211a5cf5cce..79b5472a2e74763 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -13489,6 +13489,10 @@ class Sema final {
   /// host or device attribute.
   void CUDASetLambdaAttrs(CXXMethodDecl *Method);
 
+  /// Record \p FD if it is a CUDA/HIP implicit host device function used on
+  /// device side in device compilation.
+  void CUDARecordImplicitHostDeviceFuncUsedByDevice(const FunctionDecl *FD);
+
   /// Finds a function in \p Matches with highest calling priority
   /// from \p Caller context and erases all functions with lower
   /// calling priority.
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 4203a6218aba632..99ba025ef27d9aa 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -28,6 +28,7 @@
 #include "CoverageMappingGen.h"
 #include "TargetInfo.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTLambda.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/DeclObjC.h"
@@ -3565,6 +3566,14 @@ ConstantAddress CodeGenModule::GetWeakRefReference(const ValueDecl *VD) {
   return ConstantAddress(Aliasee, DeclTy, Alignment);
 }
 
+template <typename AttrT> static bool hasImplicitAttr(const ValueDecl *D) {
+  if (!D)
+    return false;
+  if (auto *A = D->getAttr<AttrT>())
+    return A->isImplicit();
+  return D->isImplicit();
+}
+
 void CodeGenModule::EmitGlobal(GlobalDecl GD) {
   const auto *Global = cast<ValueDecl>(GD.getDecl());
 
@@ -3586,16 +3595,23 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
     return emitCPUDispatchDefinition(GD);
 
   // If this is CUDA, be selective about which declarations we emit.
+  // Non-constexpr non-lambda implicit host device functions are not emitted
+  // unless they are used on device side.
   if (LangOpts.CUDA) {
     if (LangOpts.CUDAIsDevice) {
-      if (!Global->hasAttr<CUDADeviceAttr>() &&
+      const auto *FD = dyn_cast<FunctionDecl>(Global);
+      if ((!Global->hasAttr<CUDADeviceAttr>() ||
+           (LangOpts.OffloadImplicitHostDeviceTemplates && FD &&
+            hasImplicitAttr<CUDAHostAttr>(FD) &&
+            hasImplicitAttr<CUDADeviceAttr>(FD) && !FD->isConstexpr() &&
+            !isLambdaCallOperator(FD) &&
+            !getContext().CUDAImplicitHostDeviceFunUsedByDevice.count(FD))) &&
           !Global->hasAttr<CUDAGlobalAttr>() &&
           !Global->hasAttr<CUDAConstantAttr>() &&
           !Global->hasAttr<CUDASharedAttr>() &&
           !Global->getType()->isCUDADeviceBuiltinSurfaceType() &&
           !Global->getType()->isCUDADeviceBuiltinTextureType() &&
-          !(LangOpts.HIPStdPar &&
-            isa<FunctionDecl>(Global) &&
+          !(LangOpts.HIPStdPar && isa<FunctionDecl>(Global) &&
             !Global->hasAttr<CUDAHostAttr>()))
         return;
     } else {
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 79f7fba22570746..6570ed1bf3981e1 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7395,6 +7395,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_foffload_uniform_block,
                   options::OPT_fno_offload_uniform_block);
 
+  Args.AddLastArg(CmdArgs, options::OPT_foffload_implicit_host_device_templates,
+                  options::OPT_fno_offload_implicit_host_device_templates);
+
   if (IsCudaDevice || IsHIPDevice) {
     StringRef InlineThresh =
         Args.getLastArgValue(options::OPT_fgpu_inline_threshold_EQ);
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index d993499cf4a6e6e..318174f7be8fa95 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -678,6 +678,27 @@ void Sema::checkAllowedCUDAInitializer(VarDecl *VD) {
   }
 }
 
+void Sema::CUDARecordImplicitHostDeviceFuncUsedByDevice(
+    const FunctionDecl *Callee) {
+  FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true);
+  if (!Caller)
+    return;
+
+  if (!isCUDAImplicitHostDeviceFunction(Callee))
+    return;
+
+  CUDAFunctionTarget CallerTarget = IdentifyCUDATarget(Caller);
+
+  // Record whether an implicit host device function is used on device side.
+  if (CallerTarget != CFT_Device && CallerTarget != CFT_Global &&
+      (CallerTarget != CFT_HostDevice ||
+       (isCUDAImplicitHostDeviceFunction(Caller) &&
+        !getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(Caller))))
+    return;
+
+  getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.insert(Callee);
+}
+
 // With -fcuda-host-device-constexpr, an unattributed constexpr function is
 // treated as implicitly __host__ __device__, unless:
 //  * it is a variadic function (device-side variadic functions are not
@@ -702,6 +723,18 @@ void Sema::maybeAddCUDAHostDeviceAttrs(FunctionDecl *NewD,
     return;
   }
 
+  // If a template function has no host/device/global attributes,
+  // make it implicitly host device function.
+  if (getLangOpts().OffloadImplicitHostDeviceTemplates &&
+      !NewD->hasAttr<CUDAHostAttr>() && !NewD->hasAttr<CUDADeviceAttr>() &&
+      !NewD->hasAttr<CUDAGlobalAttr>() &&
+      (NewD->getDescribedFunctionTemplate() ||
+       NewD->isFunctionTemplateSpecialization())) {
+    NewD->addAttr(CUDAHostAttr::CreateImplicit(Context));
+    NewD->addAttr(CUDADeviceAttr::CreateImplicit(Context));
+    return;
+  }
+
   if (!getLangOpts().CUDAHostDeviceConstexpr || !NewD->isConstexpr() ||
       NewD->isVariadic() || NewD->hasAttr<CUDAHostAttr>() ||
       NewD->hasAttr<CUDADeviceAttr>() || NewD->hasAttr<CUDAGlobalAttr>())
@@ -950,7 +983,14 @@ void Sema::checkCUDATargetOverload(FunctionDecl *NewFD,
     // HD/global functions "exist" in some sense on both the host and device, so
     // should have the same implementation on both sides.
     if (NewTarget != OldTarget &&
-        ((NewTarget == CFT_HostDevice) || (OldTarget == CFT_HostDevice) ||
+        ((NewTarget == CFT_HostDevice &&
+          !(LangOpts.OffloadImplicitHostDeviceTemplates &&
+            isCUDAImplicitHostDeviceFunction(NewFD) &&
+            OldTarget == CFT_Device)) ||
+         (OldTarget == CFT_HostDevice &&
+          !(LangOpts.OffloadImplicitHostDeviceTemplates &&
+            isCUDAImplicitHostDeviceFunction(OldFD) &&
+            NewTarget == CFT_Device)) ||
          (NewTarget == CFT_Global) || (OldTarget == CFT_Global)) &&
         !IsOverload(NewFD, OldFD, /* UseMemberUsingDeclRules = */ false,
                     /* ConsiderCudaAttrs = */ false)) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 8d3086cf3962c74..17a4e9003ba6624 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -19101,6 +19101,13 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
   if (FPT && isUnresolvedExceptionSpec(FPT->getExceptionSpecType()))
     ResolveExceptionSpec(Loc, FPT);
 
+  // A callee could be called by a host function then by a device function.
+  // If we only try recording once, we will miss recording the use on device
+  // side. Therefore keep trying until it is recorded.
+  if (LangOpts.OffloadImplicitHostDeviceTemplates && LangOpts.CUDAIsDevice &&
+      !getASTContext().CUDAImplicitHostDeviceFunUsedByDevice.count(Func))
+    CUDARecordImplicitHostDeviceFuncUsedByDevice(Func);
+
   // If this is the first "real" use, act on that.
   if (OdrUse == OdrUseContext::Used && !Func->isUsed(/*CheckUsedAttr=*/false)) {
     // Keep track of used but undefined functions.
diff --git a/clang/test/CodeGenCUDA/implicit-host-device-fun.cu b/clang/test/CodeGenCUDA/implicit-host-device-fun.cu
new file mode 100644
index 000000000000000..19c13b38b509662
--- /dev/null
+++ b/clang/test/CodeGenCUDA/implicit-host-device-fun.cu
@@ -0,0 +1,118 @@
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu \
+// RUN:   -foffload-implicit-host-device-templates \
+// RUN:   -emit-llvm -o - -x hip %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=COMM,HOST %s 
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -target-cpu gfx1100 \
+// RUN:   -foffload-implicit-host-device-templates \
+// RUN:   -emit-llvm -o - -x hip %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=COMM,DEV %s
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -target-cpu gfx1100 \
+// RUN:   -foffload-implicit-host-device-templates \
+// RUN:   -emit-llvm -o - -x hip %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=DEV-NEG %s
+
+#include "Inputs/cuda.h"
+
+// Implicit host device template not overloaded by device template.
+// Used by both device and host function.
+// Emitted on both host and device.
+
+// COMM-LABEL: define {{.*}}@_Z20template_no_overloadIiET_S0_(
+// COMM:  ret i32 1
+template<typename T>
+T template_no_overload(T x) {
+  return 1;
+}
+
+// Implicit host device template overloaded by device template.
+// Used by both device and host function.
+// Implicit host device template emitted on host.
+// Device template emitted on device.
+
+// COMM-LABEL: define {{.*}}@_Z22template_with_overloadIiET_S0_(
+// HOST:  ret i32 2
+// DEV:  ret i32 3
+template<typename T>
+T template_with_overload(T x) {
+  return 2;
+}
+
+template<typename T>
+__device__ T template_with_overload(T x) {
+  return 3;
+}
+
+// Implicit host device template used by host function only.
+// Emitted on host only.
+// HOST-LABEL: define {{.*}}@_Z21template_used_by_hostIiET_S0_(
+// DEV-NEG-NOT: define {{.*}}@_Z21template_used_by_hostIiET_S0_(
+// HOST:  ret i32 10
+template<typename T>
+T template_used_by_host(T x) {
+  return 10;
+}
+
+// Implicit host device template indirectly used by host function only.
+// Emitted on host only.
+// HOST-LABEL: define {{.*}}@_Z32template_indirectly_used_by_hostIiET_S0_(
+// DEV-NEG-NOT: define {{.*}}@_Z32template_indirectly_used_by_hostIiET_S0_(
+// HOST:  ret i32 11
+template<typename T>
+T template_indirectly_used_by_host(T x) {
+  return 11;
+}
+
+template<typename T>
+T template_in_middle_by_host(T x) {
+  template_indirectly_used_by_host(x);
+  return 12;
+}
+
+// Implicit host device template indirectly used by device function only.
+// Emitted on device.
+// DEV-LABEL: define {{.*}}@_Z34template_indirectly_used_by_deviceIiET_S0_(
+// DEV:  ret i32 21
+template<typename T>
+T template_indirectly_used_by_device(T x) {
+  return 21;
+}
+
+template<typename T>
+T template_in_middle_by_device(T x) {
+  template_indirectly_used_by_device(x);
+  return 22;
+}
+
+// Implicit host device template indirectly used by host device function only.
+// Emitted on host and device.
+// COMM-LABEL: define {{.*}}@_Z39template_indirectly_used_by_host_deviceIiET_S0_(
+// COMM:  ret i32 31
+template<typename T>
+T template_indirectly_used_by_host_device(T x) {
+  return 31;
+}
+
+template<typename T>
+T template_in_middle_by_host_device(T x) {
+  template_indirectly_used_by_host_device(x);
+  return 32;
+}
+
+void host_fun() {
+  template_no_overload(0);
+  template_with_overload(0);
+  template_used_by_host(0);
+  template_in_middle_by_host(0);
+}
+
+__device__ void device_fun() {
+  template_no_overload(0);
+  template_with_overload(0);
+  template_in_middle_by_device(0);
+}
+
+__host__ __device__ void host_device_fun() {
+  template_in_middle_by_host_device(0);
+}
diff --git a/clang/test/Lexer/has_extension.cu b/clang/test/Lexer/has_extension.cu
new file mode 100644
index 000000000000000..fd5083e84b887de
--- /dev/null
+++ b/clang/test/Lexer/has_extension.cu
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -E -triple x86_64-linux-gnu %s -o - \
+// RUN:   | FileCheck -check-prefix=NOHDT %s
+// RUN: %clang_cc1 -E -triple x86_64-linux-gnu %s -o - \
+// RUN:   -foffload-implicit-host-device-templates \
+// RUN:   | FileCheck -check-prefix=HDT %s
+
+// NOHDT: no_implicit_host_device_templates
+// HDT: has_implicit_host_device_templates
+#if __has_extension(cuda_implicit_host_device_templates)
+int has_implicit_host_device_templates();
+#else
+int no_implicit_host_device_templates();
+#endif
diff --git a/clang/test/SemaCUDA/Inputs/forced-host-device.h b/clang/test/SemaCUDA/Inputs/forced-host-device.h
new file mode 100644
index 000000000000000..baba196373475ec
--- /dev/null
+++ b/clang/test/SemaCUDA/Inputs/forced-host-device.h
@@ -0,0 +1,6 @@
+void bar();
+#pragma clang force_cuda_host_device begin
+void foo();
+void bar();
+#pragma clang force_cuda_host_device end
+void foo() {}
diff --git a/clang/test/SemaCUDA/implicit-host-device-fun.cu b/clang/test/SemaCUDA/implicit-host-device-fun.cu
new file mode 100644
index 000000000000000..f73a48f2f11ea81
--- /dev/null
+++ b/clang/test/SemaCUDA/implicit-host-device-fun.cu
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -isystem %S/Inputs  -fsyntax-only %s
+// RUN: %clang_cc1 -isystem %S/Inputs -fcuda-is-device  -fsyntax-only %s
+// RUN: %clang_cc1 -isystem %S/Inputs -foffload-implicit-host-device-templates -fsyntax-only %s
+// RUN: %clang_cc1 -isystem %S/Inputs -foffload-implicit-host-device-templates -fcuda-is-device  -fsyntax-only %s
+
+#include <cuda.h>
+
+template<typename T>
+void tempf(T x) {
+}
+
+template<typename T>
+__device__ void tempf(T x) {
+}
+
+void host_fun() {
+  tempf(1);
+}
+
+__device__ void device_fun() {
+  tempf(1);
+}



More information about the cfe-commits mailing list