[clang] 4d680f5 - [HIP][Clang][Sema] Add Sema support for `hipstdpar`

Alex Voicu via cfe-commits cfe-commits at lists.llvm.org
Tue Oct 3 05:29:24 PDT 2023


Author: Alex Voicu
Date: 2023-10-03T13:29:12+01:00
New Revision: 4d680f56475ce17d8fb793655eb3d77ac8aee1b9

URL: https://github.com/llvm/llvm-project/commit/4d680f56475ce17d8fb793655eb3d77ac8aee1b9
DIFF: https://github.com/llvm/llvm-project/commit/4d680f56475ce17d8fb793655eb3d77ac8aee1b9.diff

LOG: [HIP][Clang][Sema] Add Sema support for `hipstdpar`

This patch adds the Sema changes needed to enable HIP parallel algorithm offload on AMDGPU targets. The change affects the CUDA / HIP language-specific checks and only manifests when compiling in `hipstdpar` mode. In that mode, we essentially do three things (see the sketch after this list):

1. Allow device-side callers to call host-side callees: since the user-visible HLL is standard C++, with no annotations / restriction mechanisms, we cannot unambiguously establish that such a call is an error, so we conservatively allow all such calls and defer actual cleanup to a subsequent pass over IR;
2. Allow host-formed lambdas to capture by reference;
3. Allow device functions to use host global variables.
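
Here is that minimal sketch, in user-facing HIP C++ (the names and explicit attributes are hypothetical; in practice these situations arise implicitly inside the accelerator algorithm library, not in user code):

  #define __device__ __attribute__((device))

  void host_fn() {}              // implicitly a host function
  int host_global = 0;           // host global variable

  __device__ void device_caller() {
    host_fn();                   // (1) device -> host call; actual cleanup is
                                 //     deferred to a later pass over IR
    ++host_global;               // (3) device function using a host global
  }

  void host_code() {
    int x = 0;
    auto l = [&x] { ++x; };      // (2) host-formed lambda capturing by
    l();                         //     reference, tolerated even if the lambda
  }                              //     later runs on the device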

Reviewed by: yaxunl

Differential Revision: https://reviews.llvm.org/D155833

Added: 
    clang/test/SemaHipStdPar/device-can-call-host.cpp

Modified: 
    clang/lib/Sema/SemaCUDA.cpp
    clang/lib/Sema/SemaExpr.cpp
    clang/lib/Sema/SemaStmtAsm.cpp

Removed: 
    


################################################################################
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 88f5484575db17a..3336dbf474df019 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -249,6 +249,15 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller,
       (CallerTarget == CFT_Global && CalleeTarget == CFT_Device))
     return CFP_Native;
 
+  // HipStdPar mode is special: whether a device-side call to a host target is
+  // valid is deferred to a subsequent pass and cannot be unambiguously
+  // adjudicated in the AST, hence we optimistically allow it to pass here.
+  if (getLangOpts().HIPStdPar &&
+      (CallerTarget == CFT_Global || CallerTarget == CFT_Device ||
+       CallerTarget == CFT_HostDevice) &&
+      CalleeTarget == CFT_Host)
+    return CFP_HostDevice;
+
   // (d) HostDevice behavior depends on compilation mode.
   if (CallerTarget == CFT_HostDevice) {
     // It's OK to call a compilation-mode matching function from an HD one.
@@ -895,7 +904,7 @@ void Sema::CUDACheckLambdaCapture(CXXMethodDecl *Callee,
   if (!ShouldCheck || !Capture.isReferenceCapture())
     return;
   auto DiagKind = SemaDiagnosticBuilder::K_Deferred;
-  if (Capture.isVariableCapture()) {
+  if (Capture.isVariableCapture() && !getLangOpts().HIPStdPar) {
     SemaDiagnosticBuilder(DiagKind, Capture.getLocation(),
                           diag::err_capture_bad_target, Callee, *this)
         << Capture.getVariable();

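Taken together, the two changes above mean that, in hipstdpar mode, a device-side caller reaching an unannotated (host) callee gets a workable overload preference rather than CFP_Never, and a by-reference variable capture no longer yields err_capture_bad_target. A hedged sketch (hypothetical names):

  #define __device__ __attribute__((device))

  void unannotated();            // CFT_Host, as in plain C++ sources

  __device__ void caller() {
    unannotated();               // preference is now CFP_HostDevice, so Sema
                                 // lets the call through; a later IR pass
                                 // decides what to do with it
  }

  void make_callback(int &n) {
    auto cb = [&n] { ++n; };     // if cb were ODR-used on the device side, its
    cb();                        // by-reference capture would no longer be a
  }                              // deferred hard error
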
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 2ed31a90c5dc1da..797b71bffbb451e 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -19157,7 +19157,7 @@ MarkVarDeclODRUsed(ValueDecl *V, SourceLocation Loc, Sema &SemaRef,
       // Diagnose ODR-use of host global variables in device functions.
       // Reference of device global variables in host functions is allowed
       // through shadow variables therefore it is not diagnosed.
-      if (SemaRef.LangOpts.CUDAIsDevice) {
+      if (SemaRef.LangOpts.CUDAIsDevice && !SemaRef.LangOpts.HIPStdPar) {
         SemaRef.targetDiag(Loc, diag::err_ref_bad_target)
             << /*host*/ 2 << /*variable*/ 1 << Var << UserTarget;
         SemaRef.targetDiag(Var->getLocation(),

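The SemaExpr.cpp change has the following user-visible effect (a minimal sketch; names are illustrative):

  #define __device__ __attribute__((device))

  int host_counter = 0;          // host global: no device-side attribute

  __device__ void bump() {
    ++host_counter;              // ODR-use of a host global from a device
                                 // function: err_ref_bad_target in normal
                                 // device compilation, accepted when compiling
                                 // with --hipstdpar
  }
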
diff --git a/clang/lib/Sema/SemaStmtAsm.cpp b/clang/lib/Sema/SemaStmtAsm.cpp
index 2acb269f042399b..83351b703c1536c 100644
--- a/clang/lib/Sema/SemaStmtAsm.cpp
+++ b/clang/lib/Sema/SemaStmtAsm.cpp
@@ -271,7 +271,8 @@ StmtResult Sema::ActOnGCCAsmStmt(SourceLocation AsmLoc, bool IsSimple,
       OutputName = Names[i]->getName();
 
     TargetInfo::ConstraintInfo Info(Literal->getString(), OutputName);
-    if (!Context.getTargetInfo().validateOutputConstraint(Info)) {
+    if (!Context.getTargetInfo().validateOutputConstraint(Info) &&
+        !(LangOpts.HIPStdPar && LangOpts.CUDAIsDevice)) {
       targetDiag(Literal->getBeginLoc(),
                  diag::err_asm_invalid_output_constraint)
           << Info.getConstraintStr();

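The SemaStmtAsm.cpp change matters because, in hipstdpar mode, host code containing target-specific inline assembly is also parsed during device (e.g. amdgcn) compilation, where its constraints do not validate. A hedged sketch, assuming an x86 host:

  unsigned host_only_tsc_low() {
    unsigned lo;
    // "=a" (x86 EAX) is not a valid output constraint for the amdgcn target;
    // with --hipstdpar -fcuda-is-device the check is skipped instead of
    // emitting err_asm_invalid_output_constraint, and host-only code such as
    // this is dealt with by the later IR pass.
    asm volatile("rdtsc" : "=a"(lo) : : "edx");
    return lo;
  }
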
diff --git a/clang/test/SemaHipStdPar/device-can-call-host.cpp b/clang/test/SemaHipStdPar/device-can-call-host.cpp
new file mode 100644
index 000000000000000..3fedc179251d281
--- /dev/null
+++ b/clang/test/SemaHipStdPar/device-can-call-host.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_cc1 -x hip %s --hipstdpar -triple amdgcn-amd-amdhsa --std=c++17 \
+// RUN:   -fcuda-is-device -emit-llvm -o /dev/null -verify
+
+// Note: These would happen implicitly, within the implementation of the
+//       accelerator specific algorithm library, and not from user code.
+
+// Calls from the accelerator side to implicitly host (i.e. unannotated)
+// functions are fine.
+
+// expected-no-diagnostics
+
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+
+extern "C" void host_fn() {}
+
+struct Dummy {};
+
+struct S {
+  S() {}
+  ~S() { host_fn(); }
+
+  int x;
+};
+
+struct T {
+  __device__ void hd() { host_fn(); }
+
+  __device__ void hd3();
+
+  void h() {}
+
+  void operator+();
+  void operator-(const T&) {}
+
+  operator Dummy() { return Dummy(); }
+};
+
+__device__ void T::hd3() { host_fn(); }
+
+template <typename T> __device__ void hd2() { host_fn(); }
+
+__global__ void kernel() { hd2<int>(); }
+
+__device__ void hd() { host_fn(); }
+
+template <typename T> __device__ void hd3() { host_fn(); }
+__device__ void device_fn() { hd3<int>(); }
+
+__device__ void local_var() {
+  S s;
+}
+
+__device__ void explicit_destructor(S *s) {
+  s->~S();
+}
+
+__device__ void hd_member_fn() {
+  T t;
+
+  t.hd();
+}
+
+__device__ void h_member_fn() {
+  T t;
+  t.h();
+}
+
+__device__ void unaryOp() {
+  T t;
+  (void) +t;
+}
+
+__device__ void binaryOp() {
+  T t;
+  (void) (t - t);
+}
+
+__device__ void implicitConversion() {
+  T t;
+  Dummy d = t;
+}
+
+template <typename T>
+struct TmplStruct {
+  template <typename U> __device__ void fn() {}
+};
+
+template <>
+template <>
+__device__ void TmplStruct<int>::fn<int>() { host_fn(); }
+
+__device__ void double_specialization() { TmplStruct<int>().fn<int>(); }