[clang] acb6f80 - [CUDA][HIP] Fix overloading resolution

Yaxun Liu via cfe-commits cfe-commits at lists.llvm.org
Wed Dec 2 13:34:05 PST 2020


Author: Yaxun (Sam) Liu
Date: 2020-12-02T16:33:33-05:00
New Revision: acb6f80d96b74af3ec515bb9811d213abb406c31

URL: https://github.com/llvm/llvm-project/commit/acb6f80d96b74af3ec515bb9811d213abb406c31
DIFF: https://github.com/llvm/llvm-project/commit/acb6f80d96b74af3ec515bb9811d213abb406c31.diff

LOG: [CUDA][HIP] Fix overloading resolution

This patch implements correct hostness-based overload resolution
in isBetterOverloadCandidate.

Based on hostness, if one candidate is emittable and the other is not,
the emittable candidate is better.

If both candidates are emittable, or neither is, other rules should be used
to determine which is better. This is because hostness-based overload
resolution is mostly about determining the viability of a function; if two
functions are both viable, other factors should take precedence.

If the other rules cannot determine which is better, CUDA preference is used
again as a tie-breaker.
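
For example (a minimal sketch, not part of the patch; it relies on clang
permitting overloads that differ only in their __host__/__device__
attributes):

  __host__ int g(int);    // wrong-sided when called from device code
  __device__ int g(int);  // same-sided when called from device code

  __device__ int caller() {
    // The standard C++ rules cannot tell the candidates apart (identical
    // parameter types), so hostness decides and the __device__ overload
    // is chosen.
    return g(0);
  }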

However, correct hostness-based overload resolution requires overload
resolution diagnostics to be deferred, which is not on by default. The
rationale is that deferring overload resolution diagnostics may hide
overload resolution issues in header files.

An option -fgpu-exclude-wrong-side-overloads is added, which is off by
default.

When -fgpu-exclude-wrong-side-overloads is off, the original behavior is kept,
that is, wrong-side overloads are excluded only if there are same-side
overloads. This may result in incorrect overload resolution when there are no
same-side candidates, but it is sufficient for most CUDA/HIP applications.

When -fgpu-exclude-wrong-side-overloads is on, deferred overload resolution
diagnostics are enabled along with correct hostness-based overload
resolution, i.e., wrong-side overloads are always excluded.
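
The ExplicitHostDeviceVsWrongSided test added to
clang/test/SemaCUDA/function-overload.cu below illustrates the difference;
a reduced sketch:

  __host__ HostReturnTy callee(double x);
  __host__ __device__ HostDeviceReturnTy callee(int x);

  inline __host__ __device__ void hd_caller() {
    // In device compilation:
    //  - with the option off, the wrong-sided __host__ candidate stays in
    //    the candidate set and wins on conversion rank (exact match for a
    //    double argument), so callee(double) is chosen;
    //  - with the option on, the wrong-sided candidate is excluded and the
    //    emittable __host__ __device__ callee(int) is chosen instead.
    callee(1.0);
  }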

Differential Revision: https://reviews.llvm.org/D80450

Added: 
    

Modified: 
    clang/include/clang/Basic/LangOptions.def
    clang/include/clang/Driver/Options.td
    clang/include/clang/Sema/Overload.h
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/lib/Frontend/CompilerInvocation.cpp
    clang/lib/Sema/SemaOverload.cpp
    clang/test/Driver/hip-options.hip
    clang/test/SemaCUDA/deferred-oeverload.cu
    clang/test/SemaCUDA/function-overload.cu

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index f41febf30c53..071cc314b7d1 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -243,6 +243,7 @@ LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
 LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
+LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
 
 LANGOPT(SYCL              , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 6e37a3154bdf..b58f5cbc63d0 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -721,6 +721,9 @@ defm gpu_allow_device_init : OptInFFlag<"gpu-allow-device-init",
 defm gpu_defer_diag : OptInFFlag<"gpu-defer-diag",
   "Defer", "Don't defer", " host/device related diagnostic messages"
   " for CUDA/HIP">;
+defm gpu_exclude_wrong_side_overloads : OptInFFlag<"gpu-exclude-wrong-side-overloads",
+  "Always exclude wrong side overloads", "Exclude wrong side overloads only if there are same side overloads",
+  " in overloading resolution for CUDA/HIP", [HelpHidden]>;
 def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
   Flags<[CC1Option]>,
   HelpText<"Default max threads per block for kernel launch bounds for HIP">;

diff  --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 4f5e497bc202..5be6a618711c 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -1051,6 +1051,9 @@ class Sema;
 
     void destroyCandidates();
 
+    /// Whether diagnostics should be deferred.
+    bool shouldDeferDiags(Sema &S, ArrayRef<Expr *> Args, SourceLocation OpLoc);
+
   public:
     OverloadCandidateSet(SourceLocation Loc, CandidateSetKind CSK,
                          OperatorRewriteInfo RewriteInfo = {})

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index caa77123f7eb..a513c0025a62 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5610,6 +5610,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     if (Args.hasFlag(options::OPT_fgpu_defer_diag,
                      options::OPT_fno_gpu_defer_diag, false))
       CmdArgs.push_back("-fgpu-defer-diag");
+    if (Args.hasFlag(options::OPT_fgpu_exclude_wrong_side_overloads,
+                     options::OPT_fno_gpu_exclude_wrong_side_overloads,
+                     false)) {
+      CmdArgs.push_back("-fgpu-exclude-wrong-side-overloads");
+      CmdArgs.push_back("-fgpu-defer-diag");
+    }
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) {

diff  --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 068c8608ca65..1c63ce612be0 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2693,6 +2693,9 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
   if (Args.hasArg(OPT_fno_cuda_host_device_constexpr))
     Opts.CUDAHostDeviceConstexpr = 0;
 
+  if (Args.hasArg(OPT_fgpu_exclude_wrong_side_overloads))
+    Opts.GPUExcludeWrongSideOverloads = 1;
+
   if (Args.hasArg(OPT_fgpu_defer_diag))
     Opts.GPUDeferDiag = 1;
 

diff  --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index f43a2a2e88e8..20a7bd08443d 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -9616,6 +9616,75 @@ bool clang::isBetterOverloadCandidate(
   else if (!Cand1.Viable)
     return false;
 
+  // [CUDA] A function with 'never' preference is marked not viable, therefore
+  // is never shown up here. The worst preference shown up here is 'wrong side',
+  // e.g. an H function called by a HD function in device compilation. This is
+  // valid AST as long as the HD function is not emitted, e.g. it is an inline
+  // function which is called only by an H function. A deferred diagnostic will
+  // be triggered if it is emitted. However a wrong-sided function is still
+  // a viable candidate here.
+  //
+  // If Cand1 can be emitted and Cand2 cannot be emitted in the current
+  // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2
+  // can be emitted, Cand1 is not better than Cand2. This rule should have
+  // precedence over other rules.
+  //
+  // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then
+  // other rules should be used to determine which is better. This is because
+  // host/device based overloading resolution is mostly for determining
+  // viability of a function. If two functions are both viable, other factors
+  // should take precedence in preference, e.g. the standard-defined preferences
+  // like argument conversion ranks or enable_if partial-ordering. The
+  // preference for pass-object-size parameters is probably most similar to a
+  // type-based-overloading decision and so should take priority.
+  //
+  // If other rules cannot determine which is better, CUDA preference will be
+  // used again to determine which is better.
+  //
+  // TODO: Currently IdentifyCUDAPreference does not return correct values
+  // for functions called in global variable initializers due to missing
+  // correct context about device/host. Therefore we can only enforce this
+  // rule when there is a caller. We should enforce this rule for functions
+  // in global variable initializers once proper context is added.
+  //
+  // TODO: We can only enable the hostness based overloading resolution when
+  // -fgpu-exclude-wrong-side-overloads is on since this requires deferring
+  // overloading resolution diagnostics.
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function &&
+      S.getLangOpts().GPUExcludeWrongSideOverloads) {
+    if (FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext)) {
+      bool IsCallerImplicitHD = Sema::isCUDAImplicitHostDeviceFunction(Caller);
+      bool IsCand1ImplicitHD =
+          Sema::isCUDAImplicitHostDeviceFunction(Cand1.Function);
+      bool IsCand2ImplicitHD =
+          Sema::isCUDAImplicitHostDeviceFunction(Cand2.Function);
+      auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function);
+      auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function);
+      assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never);
+      // The implicit HD function may be a function in a system header which
+      // is forced by pragma. In device compilation, if we prefer HD candidates
+      // over wrong-sided candidates, overloading resolution may change, which
+      // may result in non-deferrable diagnostics. As a workaround, we let
+      // implicit HD candidates take equal preference as wrong-sided candidates.
+      // This will preserve the overloading resolution.
+      // TODO: We still need special handling of implicit HD functions since
+      // they may incur other diagnostics to be deferred. We should make all
+      // host/device related diagnostics deferrable and remove special handling
+      // of implicit HD functions.
+      auto EmitThreshold =
+          (S.getLangOpts().CUDAIsDevice && IsCallerImplicitHD &&
+           (IsCand1ImplicitHD || IsCand2ImplicitHD))
+              ? Sema::CFP_Never
+              : Sema::CFP_WrongSide;
+      auto Cand1Emittable = P1 > EmitThreshold;
+      auto Cand2Emittable = P2 > EmitThreshold;
+      if (Cand1Emittable && !Cand2Emittable)
+        return true;
+      if (!Cand1Emittable && Cand2Emittable)
+        return false;
+    }
+  }
+
   // C++ [over.match.best]p1:
   //
   //   -- if F is a static member function, ICS1(F) is defined such
@@ -9850,12 +9919,6 @@ bool clang::isBetterOverloadCandidate(
       return Cmp == Comparison::Better;
   }
 
-  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
-    FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
-    return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
-           S.IdentifyCUDAPreference(Caller, Cand2.Function);
-  }
-
   bool HasPS1 = Cand1.Function != nullptr &&
                 functionHasPassObjectSizeParams(Cand1.Function);
   bool HasPS2 = Cand2.Function != nullptr &&
@@ -9863,8 +9926,21 @@ bool clang::isBetterOverloadCandidate(
   if (HasPS1 != HasPS2 && HasPS1)
     return true;
 
-  Comparison MV = isBetterMultiversionCandidate(Cand1, Cand2);
-  return MV == Comparison::Better;
+  auto MV = isBetterMultiversionCandidate(Cand1, Cand2);
+  if (MV == Comparison::Better)
+    return true;
+  if (MV == Comparison::Worse)
+    return false;
+
+  // If other rules cannot determine which is better, CUDA preference is used
+  // to determine which is better.
+  if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) {
+    FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
+    return S.IdentifyCUDAPreference(Caller, Cand1.Function) >
+           S.IdentifyCUDAPreference(Caller, Cand2.Function);
+  }
+
+  return false;
 }
 
 /// Determine whether two declarations are "equivalent" for the purposes of
@@ -9957,7 +10033,11 @@ OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc,
   // only on their host/device attributes. Specifically, if one
   // candidate call is WrongSide and the other is SameSide, we ignore
   // the WrongSide candidate.
-  if (S.getLangOpts().CUDA) {
+  // We only need to remove wrong-sided candidates here if
+  // -fgpu-exclude-wrong-side-overloads is off. When
+  // -fgpu-exclude-wrong-side-overloads is on, all candidates are compared
+  // uniformly in isBetterOverloadCandidate.
+  if (S.getLangOpts().CUDA && !S.getLangOpts().GPUExcludeWrongSideOverloads) {
     const FunctionDecl *Caller = dyn_cast<FunctionDecl>(S.CurContext);
     bool ContainsSameSideCandidate =
         llvm::any_of(Candidates, [&](OverloadCandidate *Cand) {
@@ -11620,26 +11700,34 @@ SmallVector<OverloadCandidate *, 32> OverloadCandidateSet::CompleteCandidates(
   return Cands;
 }
 
-/// When overload resolution fails, prints diagnostic messages containing the
-/// candidates in the candidate set.
-void OverloadCandidateSet::NoteCandidates(PartialDiagnosticAt PD,
-    Sema &S, OverloadCandidateDisplayKind OCD, ArrayRef<Expr *> Args,
-    StringRef Opc, SourceLocation OpLoc,
-    llvm::function_ref<bool(OverloadCandidate &)> Filter) {
-
+bool OverloadCandidateSet::shouldDeferDiags(Sema &S, ArrayRef<Expr *> Args,
+                                            SourceLocation OpLoc) {
   bool DeferHint = false;
   if (S.getLangOpts().CUDA && S.getLangOpts().GPUDeferDiag) {
-    // Defer diagnostic for CUDA/HIP if there are wrong-sided candidates.
+    // Defer diagnostic for CUDA/HIP if there are wrong-sided candidates or
+    // host device candidates.
     auto WrongSidedCands =
         CompleteCandidates(S, OCD_AllCandidates, Args, OpLoc, [](auto &Cand) {
-          return Cand.Viable == false &&
-                 Cand.FailureKind == ovl_fail_bad_target;
+          return (Cand.Viable == false &&
+                  Cand.FailureKind == ovl_fail_bad_target) ||
+                 (Cand.Function->template hasAttr<CUDAHostAttr>() &&
+                  Cand.Function->template hasAttr<CUDADeviceAttr>());
         });
     DeferHint = WrongSidedCands.size();
   }
+  return DeferHint;
+}
+
+/// When overload resolution fails, prints diagnostic messages containing the
+/// candidates in the candidate set.
+void OverloadCandidateSet::NoteCandidates(
+    PartialDiagnosticAt PD, Sema &S, OverloadCandidateDisplayKind OCD,
+    ArrayRef<Expr *> Args, StringRef Opc, SourceLocation OpLoc,
+    llvm::function_ref<bool(OverloadCandidate &)> Filter) {
+
   auto Cands = CompleteCandidates(S, OCD, Args, OpLoc, Filter);
 
-  S.Diag(PD.first, PD.second, DeferHint);
+  S.Diag(PD.first, PD.second, shouldDeferDiags(S, Args, OpLoc));
 
   NoteCandidates(S, Args, Cands, Opc, OpLoc);
 
@@ -11691,7 +11779,9 @@ void OverloadCandidateSet::NoteCandidates(Sema &S, ArrayRef<Expr *> Args,
   }
 
   if (I != E)
-    S.Diag(OpLoc, diag::note_ovl_too_many_candidates) << int(E - I);
+    S.Diag(OpLoc, diag::note_ovl_too_many_candidates,
+           shouldDeferDiags(S, Args, OpLoc))
+        << int(E - I);
 }
 
 static SourceLocation

diff  --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip
index 46cfe0a531f6..d66eb14ca9bf 100644
--- a/clang/test/Driver/hip-options.hip
+++ b/clang/test/Driver/hip-options.hip
@@ -35,3 +35,8 @@
 // RUN: %clang -### -nogpuinc -nogpulib -munsafe-fp-atomics \
 // RUN:   --cuda-gpu-arch=gfx906  %s 2>&1 | FileCheck -check-prefix=UNSAFE-FP-ATOMICS %s
 // UNSAFE-FP-ATOMICS: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-munsafe-fp-atomics"
+
+// RUN: %clang -### -target x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fgpu-exclude-wrong-side-overloads \
+// RUN:   --cuda-gpu-arch=gfx906  %s 2>&1 | FileCheck -check-prefix=FIX-OVERLOAD %s
+// FIX-OVERLOAD: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fgpu-exclude-wrong-side-overloads" "-fgpu-defer-diag"
+// FIX-OVERLOAD: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fgpu-exclude-wrong-side-overloads" "-fgpu-defer-diag"

diff  --git a/clang/test/SemaCUDA/deferred-oeverload.cu b/clang/test/SemaCUDA/deferred-oeverload.cu
index a90bb7cf0d86..c800ac527ead 100644
--- a/clang/test/SemaCUDA/deferred-oeverload.cu
+++ b/clang/test/SemaCUDA/deferred-oeverload.cu
@@ -54,7 +54,7 @@ struct B { int x; };
 // This fails to substitue for A but no diagnostic
 // should be emitted.
 template<typename T, typename T::foo* = nullptr>
-__host__ __device__ void sfinae(T t) { // com-note {{candidate template ignored: substitution failure [with T = B]}}
+__host__ __device__ void sfinae(T t) { // host-note {{candidate template ignored: substitution failure [with T = B]}}
   t.x = 1;
 }
 
@@ -64,13 +64,13 @@ __host__ __device__ void sfinae(T t) { // com-note {{candidate template ignored:
 // file scope.
 
 template<typename T, typename T::isA* = nullptr>
-__host__ __device__ void sfinae(T t) { // com-note {{candidate template ignored: substitution failure [with T = B]}}
+__host__ __device__ void sfinae(T t) { // host-note {{candidate template ignored: substitution failure [with T = B]}}
   t.x = 1;
 }
 
 void test_sfinae() {
   sfinae(A());
-  sfinae(B()); // com-error{{no matching function for call to 'sfinae'}}
+  sfinae(B()); // host-error{{no matching function for call to 'sfinae'}}
 }
 
 // Make sure throw is diagnosed in OpenMP parallel region in host function.

diff  --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu
index 191268c9a5f1..574b65ee7fd8 100644
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@@ -1,8 +1,16 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify=host,expected %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify=dev,expected %s
+// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only \
+// RUN:   -verify=host,hostdefer,devdefer,expected %s
+// RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only \
+// RUN:   -fcuda-is-device -verify=dev,devnodeferonly,hostdefer,devdefer,expected %s
+// RUN: %clang_cc1 -fgpu-exclude-wrong-side-overloads -fgpu-defer-diag -DDEFER=1 \
+// RUN:    -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only \
+// RUN:    -verify=host,hostdefer,expected %s
+// RUN: %clang_cc1 -fgpu-exclude-wrong-side-overloads -fgpu-defer-diag -DDEFER=1 \
+// RUN:    -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device \
+// RUN:    -verify=dev,devdeferonly,devdefer,expected %s
 
 #include "Inputs/cuda.h"
 
@@ -76,37 +84,37 @@ extern "C" __host__ __device__ int chhd2() { return 0; }
 // Helper functions to verify calling restrictions.
 __device__ DeviceReturnTy d() { return DeviceReturnTy(); }
 // host-note at -1 1+ {{'d' declared here}}
-// expected-note at -2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// hostdefer-note at -2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
 // expected-note at -3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
 
 __host__ HostReturnTy h() { return HostReturnTy(); }
 // dev-note at -1 1+ {{'h' declared here}}
-// expected-note at -2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// devdefer-note at -2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
 // expected-note at -3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-// expected-note at -4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+// devdefer-note at -4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
 __global__ void g() {}
 // dev-note at -1 1+ {{'g' declared here}}
-// expected-note at -2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
+// devdefer-note at -2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
 // expected-note at -3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-// expected-note at -4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
+// devdefer-note at -4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
 
 extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
 // host-note at -1 1+ {{'cd' declared here}}
-// expected-note at -2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// hostdefer-note at -2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
 // expected-note at -3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
 
 extern "C" __host__ HostReturnTy ch() { return HostReturnTy(); }
 // dev-note at -1 1+ {{'ch' declared here}}
-// expected-note at -2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// devdefer-note at -2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
 // expected-note at -3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-// expected-note at -4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+// devdefer-note at -4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
 
 __host__ void hostf() {
   DeviceFnPtr fp_d = d;         // host-error {{reference to __device__ function 'd' in __host__ function}}
-  DeviceReturnTy ret_d = d();   // expected-error {{no matching function for call to 'd'}}
+  DeviceReturnTy ret_d = d();   // hostdefer-error {{no matching function for call to 'd'}}
   DeviceFnPtr fp_cd = cd;       // host-error {{reference to __device__ function 'cd' in __host__ function}}
-  DeviceReturnTy ret_cd = cd(); // expected-error {{no matching function for call to 'cd'}}
+  DeviceReturnTy ret_cd = cd(); // hostdefer-error {{no matching function for call to 'cd'}}
 
   HostFnPtr fp_h = h;
   HostReturnTy ret_h = h();
@@ -130,9 +138,9 @@ __device__ void devicef() {
   DeviceReturnTy ret_cd = cd();
 
   HostFnPtr fp_h = h;         // dev-error {{reference to __host__ function 'h' in __device__ function}}
-  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostReturnTy ret_h = h();   // devdefer-error {{no matching function for call to 'h'}}
   HostFnPtr fp_ch = ch;       // dev-error {{reference to __host__ function 'ch' in __device__ function}}
-  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+  HostReturnTy ret_ch = ch(); // devdefer-error {{no matching function for call to 'ch'}}
 
   DeviceFnPtr fp_dh = dh;
   DeviceReturnTy ret_dh = dh();
@@ -140,7 +148,7 @@ __device__ void devicef() {
   DeviceReturnTy ret_cdh = cdh();
 
   GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __device__ function}}
-  g(); // expected-error {{no matching function for call to 'g'}}
+  g(); // devdefer-error {{no matching function for call to 'g'}}
   g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __device__ function}}
 }
 
@@ -151,9 +159,9 @@ __global__ void globalf() {
   DeviceReturnTy ret_cd = cd();
 
   HostFnPtr fp_h = h;         // dev-error {{reference to __host__ function 'h' in __global__ function}}
-  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostReturnTy ret_h = h();   // devdefer-error {{no matching function for call to 'h'}}
   HostFnPtr fp_ch = ch;       // dev-error {{reference to __host__ function 'ch' in __global__ function}}
-  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+  HostReturnTy ret_ch = ch(); // devdefer-error {{no matching function for call to 'ch'}}
 
   DeviceFnPtr fp_dh = dh;
   DeviceReturnTy ret_dh = dh();
@@ -161,7 +169,7 @@ __global__ void globalf() {
   DeviceReturnTy ret_cdh = cdh();
 
   GlobalFnPtr fp_g = g; // dev-error {{reference to __global__ function 'g' in __global__ function}}
-  g(); // expected-error {{no matching function for call to 'g'}}
+  g(); // devdefer-error {{no matching function for call to 'g'}}
   g<<<0,0>>>(); // dev-error {{reference to __global__ function 'g' in __global__ function}}
 }
 
@@ -184,7 +192,7 @@ __host__ __device__ void hostdevicef() {
 #if defined(__CUDA_ARCH__)
   // expected-error at -5 {{reference to __host__ function 'h' in __host__ __device__ function}}
   // expected-error at -5 {{reference to __host__ function 'h' in __host__ __device__ function}}
-  // expected-error at -5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
+  // devdefer-error at -5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
   // expected-error at -5 {{reference to __host__ function 'ch' in __host__ __device__ function}}
 #endif
 
@@ -331,9 +339,7 @@ __device__ void test_device_calls_template_fn() {
 // If we have a mix of HD and H-only or D-only candidates in the overload set,
 // normal C++ overload resolution rules apply first.
 template <typename T> TemplateReturnTy template_vs_hd_function(T arg)
-#ifdef __CUDA_ARCH__
-//expected-note at -2 {{declared here}}
-#endif
+// devnodeferonly-note at -1{{'template_vs_hd_function<int>' declared here}}
 {
   return TemplateReturnTy();
 }
@@ -342,11 +348,14 @@ __host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) {
 }
 
 __host__ __device__ void test_host_device_calls_hd_template() {
-  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
-  TemplateReturnTy ret2 = template_vs_hd_function(1);
-#ifdef __CUDA_ARCH__
-  // expected-error at -2 {{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
+#if __CUDA_ARCH__ && DEFER
+  typedef HostDeviceReturnTy ExpectedReturnTy;
+#else
+  typedef TemplateReturnTy ExpectedReturnTy;
 #endif
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  ExpectedReturnTy ret2 = template_vs_hd_function(1);
+  // devnodeferonly-error at -1{{reference to __host__ function 'template_vs_hd_function<int>' in __host__ __device__ function}}
 }
 
 __host__ void test_host_calls_hd_template() {
@@ -367,14 +376,14 @@ __device__ void test_device_calls_hd_template() {
 __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
 __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
 #ifndef __CUDA_ARCH__
-  // expected-note at -3 {{'device_only_function' declared here}}
-  // expected-note at -3 {{'device_only_function' declared here}}
+  // expected-note at -3 2{{'device_only_function' declared here}}
+  // expected-note at -3 2{{'device_only_function' declared here}}
 #endif
 __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
 __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
 #ifdef __CUDA_ARCH__
-  // expected-note at -3 {{'host_only_function' declared here}}
-  // expected-note at -3 {{'host_only_function' declared here}}
+  // expected-note at -3 2{{'host_only_function' declared here}}
+  // expected-note at -3 2{{'host_only_function' declared here}}
 #endif
 
 __host__ __device__ void test_host_device_single_side_overloading() {
@@ -392,6 +401,37 @@ __host__ __device__ void test_host_device_single_side_overloading() {
 #endif
 }
 
+// wrong-sided overloading should not cause diagnostic unless it is emitted.
+// This inline function is not emitted.
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+}
+
+// wrong-sided overloading should cause diagnostic if it is emitted.
+// This inline function is emitted since it is called by an emitted function.
+inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+#ifndef __CUDA_ARCH__
+  // expected-error at -3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+  // expected-error at -3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}}
+#endif
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+#ifdef __CUDA_ARCH__
+  // expected-error at -3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+  // expected-error at -3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}}
+#endif
+}
+
+__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() {
+  test_host_device_wrong_side_overloading_inline_diag();
+  // expected-note at -1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}}
+}
+
 // Verify that we allow overloading function templates.
 template <typename T> __host__ T template_overload(const T &a) { return a; };
 template <typename T> __device__ T template_overload(const T &a) { return a; };
@@ -419,3 +459,255 @@ __host__ __device__ int constexpr_overload(const T &x, const T &y) {
 int test_constexpr_overload(C2 &x, C2 &y) {
   return constexpr_overload(x, y);
 }
+
+// Verify no ambiguity for new operator.
+void *a = new int;
+__device__ void *b = new int;
+// expected-error at -1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+// Verify no ambiguity for new operator.
+template<typename _Tp> _Tp&& f();
+template<typename _Tp, typename = decltype(new _Tp(f<_Tp>()))>
+void __test();
+
+void foo() {
+  __test<int>();
+}
+
+// Test resolving implicit host device candidate vs wrong-sided candidate.
+// In device compilation, implicit host device caller choose implicit host
+// device candidate and wrong-sided candidate with equal preference.
+// Resolution result should not change with/without pragma.
+namespace ImplicitHostDeviceVsWrongSided {
+HostReturnTy callee(double x);
+#pragma clang force_cuda_host_device begin
+HostDeviceReturnTy callee(int x);
+inline HostReturnTy implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving implicit host device candidate vs same-sided candidate.
+// In host compilation, implicit host device caller choose implicit host
+// device candidate and same-sided candidate with equal preference.
+// Resolution result should not change with/without pragma.
+namespace ImplicitHostDeviceVsSameSide {
+HostReturnTy callee(int x);
+#pragma clang force_cuda_host_device begin
+HostDeviceReturnTy callee(double x);
+inline HostDeviceReturnTy implicit_hd_caller() {
+  return callee(1.0);
+}
+#pragma clang force_cuda_host_device end
+}
+
+// Test resolving explicit host device candidate vs. wrong-sided candidate.
+// When -fgpu-defer-diag is off, wrong-sided candidate is not excluded, therefore
+// the first callee is chosen.
+// When -fgpu-defer-diag is on, wrong-sided candidate is excluded, therefore
+// the second callee is chosen.
+namespace ExplicitHostDeviceVsWrongSided {
+HostReturnTy callee(double x);
+__host__ __device__ HostDeviceReturnTy callee(int x);
+#if __CUDA_ARCH__ && DEFER
+typedef HostDeviceReturnTy ExpectedRetTy;
+#else
+typedef HostReturnTy ExpectedRetTy;
+#endif
+inline __host__ __device__ ExpectedRetTy explicit_hd_caller() {
+  return callee(1.0);
+}
+}
+
+// In the implicit host device function 'caller', the second 'callee' should be
+// chosen since it has better match, even though it is an implicit host device
+// function whereas the first 'callee' is a host function. A diagnostic will be
+// emitted if the first 'callee' is chosen since deduced return type cannot be
+// used before it is defined.
+namespace ImplicitHostDeviceByConstExpr {
+template <class a> a b;
+auto callee(...);
+template <class d> constexpr auto callee(d) -> decltype(0);
+struct e {
+  template <class ad, class... f> static auto g(ad, f...) {
+    return h<e, decltype(b<f>)...>;
+  }
+  struct i {
+    template <class, class... f> static constexpr auto caller(f... k) {
+      return callee(k...);
+    }
+  };
+  template <class, class... f> static auto h() {
+    return i::caller<int, f...>;
+  }
+};
+class l {
+  l() {
+    e::g([] {}, this);
+  }
+};
+}
+
+// Implicit HD candidate competes with device candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithD {
+  struct a {
+    __device__ a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// Implicit HD candidate competes with host candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithH {
+  struct a {
+    a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// Implicit HD candidate competes with HD candidate.
+// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved.
+// copy ctor of a should win over a(short), otherwise there will be ambiguity
+// due to conversion operator.
+namespace TestImplicitHDWithHD {
+  struct a {
+    __host__ __device__ a(short);
+    __device__ operator unsigned() const;
+    __device__ operator int() const;
+  };
+  struct b {
+    a d;
+  };
+  void f(b g) { b e = g; }
+}
+
+// HD candidate competes with H candidate.
+// HD has type mismatch whereas H has type match.
+// In device compilation, H wins when -fgpu-defer-diag is off and HD wins
+// when -fgpu-defer-diags is on. In both cases the diagnostic should be
+// deferred.
+namespace TestDeferNoMatchingFuncNotEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d);
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>) {
+    ae e;
+    ag(e);
+  }
+  void f() { (void)ag<b::c>; }
+}
+
+namespace TestDeferNoMatchingFuncEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d);
+    // devnodeferonly-note at -1{{'ag<TestDeferNoMatchingFuncEmitted::b::c>' declared here}}
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>) {
+    ae e;
+    ag(e);
+    // devnodeferonly-error at -1{{reference to __host__ function 'ag<TestDeferNoMatchingFuncEmitted::b::c>' in __host__ __device__ function}}
+    // devdeferonly-error at -2{{no matching function for call to 'ag'}}
+    // devdeferonly-note at -3{{called by 'ag<TestDeferNoMatchingFuncEmitted::b::c>'}}
+  }
+  __host__ __device__ void f() { (void)ag<b::c>; }
+  // devnodeferonly-note at -1{{called by 'f'}}
+  // devdeferonly-note at -2{{called by 'f'}}
+}
+
+// Two HD candidates compete with H candidate.
+// HDs have type mismatch whereas H has type match.
+// In device compilation, H wins when -fgpu-defer-diag is off and two HD win
+// when -fgpu-defer-diags is on. In both cases the diagnostic should be
+// deferred.
+namespace TestDeferAmbiguityNotEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d, int);
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, float) {
+    ae e;
+    ag(e, 1);
+  }
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, double) {
+  }
+  void f() {
+    b::c x;
+    ag(x, 1);
+  }
+}
+
+namespace TestDeferAmbiguityEmitted {
+  template <typename> struct a {};
+  namespace b {
+    struct c : a<int> {};
+    template <typename d> void ag(d, int);
+    // devnodeferonly-note at -1{{'ag<TestDeferAmbiguityEmitted::b::c>' declared here}}
+  } // namespace b
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, float) {
+    // devdeferonly-note at -1{{candidate function [with ae = int]}}
+    ae e;
+    ag(e, 1);
+  }
+  template <typename ae>
+  __host__ __device__ void ag(a<ae>, double) {
+    // devdeferonly-note at -1{{candidate function [with ae = int]}}
+  }
+  __host__ __device__ void f() {
+    b::c x;
+    ag(x, 1);
+    // devnodeferonly-error at -1{{reference to __host__ function 'ag<TestDeferAmbiguityEmitted::b::c>' in __host__ __device__ function}}
+    // devdeferonly-error at -2{{call to 'ag' is ambiguous}}
+  }
+}
+
+// Implicit HD functions compute with H function and D function.
+// In host compilation, foo(0.0, 2) should resolve to X::foo<double, int>.
+// In device compilation, foo(0.0, 2) should resolve to foo(double, int).
+// In either case there should be no ambiguity.
+namespace TestImplicitHDWithHAndD {
+  namespace X {
+    inline double foo(double, double) { return 0;}
+    inline constexpr float foo(float, float) { return 1;}
+    inline constexpr long double foo(long double, long double) { return 2;}
+    template<typename _Tp, typename _Up> inline constexpr double foo(_Tp, _Up) { return 3;}
+  };
+  using X::foo;
+  inline __device__ double foo(double, double) { return 4;}
+  inline __device__ float foo(float, int) { return 5;}
+  inline __device__ float foo(int, int) { return 6;}
+  inline __device__ double foo(double, int) { return 7;}
+  inline __device__ float foo(float, float) { return 9;}
+  template<typename _Tp, typename _Up> inline __device__ double foo(_Tp, _Up) { return 10;}
+
+  int g() {
+    return [](){
+    return foo(0.0, 2);
+    }();
+  }
+}


        

