[clang] 0660397 - [CUDA] Allow targeting NVPTX directly without a host toolchain

Joseph Huber via cfe-commits <cfe-commits at lists.llvm.org>
Wed Jan 18 16:18:33 PST 2023


Author: Joseph Huber
Date: 2023-01-18T18:18:25-06:00
New Revision: 0660397e68096a64279b19f913e7de2c283e524f

URL: https://github.com/llvm/llvm-project/commit/0660397e68096a64279b19f913e7de2c283e524f
DIFF: https://github.com/llvm/llvm-project/commit/0660397e68096a64279b19f913e7de2c283e524f.diff

LOG: [CUDA] Allow targeting NVPTX directly without a host toolchain

Currently, the NVPTX compilation toolchain can only be invoked through
CUDA or OpenMP via `--offload-device-only`. This is because we cannot
build a CUDA toolchain without an accompanying host toolchain for the
offloading. As a result, using `--target=nvptx64-nvidia-cuda` directly
causes the driver to generate calls to the GNU assembler and linker,
leading to errors.

This patch abstracts the portions of the CUDA toolchain that are
independent of the host toolchain or offloading kind into a new base
class called `NVPTXToolChain`. We still need the host's triple to locate
the CUDA installation, so when no host toolchain is present we assume it
matches the host system for now; alternatively, the user can provide the
installation path explicitly.
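
For example, overriding the installation detection could look like the
following (the path is illustrative; `--cuda-path` is the existing flag
for pointing the driver at a CUDA installation):

    $ clang --target=nvptx64-nvidia-cuda --cuda-path=/opt/cuda foo.c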

This should allow the compiler driver to create NVPTX device images
directly from C/C++ code.
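
As a sketch of the expected flow (mirroring the new
`cuda-cross-compiling.c` test below; the architecture and file names are
illustrative):

    $ clang --target=nvptx64-nvidia-cuda -march=sm_61 kernel.c -o kernel
    # 1. clang -cc1 ... -S ... -o kernel-xxx.s          (C -> PTX)
    # 2. ptxas ... --output-file kernel-xxx.cubin kernel-xxx.s -c
    # 3. nvlink -o kernel -arch sm_61 ... kernel-xxx.cubin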

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D140158

Added: 
    clang/test/Driver/cuda-cross-compiling.c

Modified: 
    clang/lib/Driver/Driver.cpp
    clang/lib/Driver/ToolChains/Cuda.cpp
    clang/lib/Driver/ToolChains/Cuda.h

Removed: 
    


################################################################################
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index e20aad49b0cd..555512f8c326 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6039,6 +6039,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     case llvm::Triple::Solaris:
       TC = std::make_unique<toolchains::Solaris>(*this, Target, Args);
       break;
+    case llvm::Triple::CUDA:
+      TC = std::make_unique<toolchains::NVPTXToolChain>(*this, Target, Args);
+      break;
     case llvm::Triple::AMDHSA:
       TC = std::make_unique<toolchains::ROCMToolChain>(*this, Target, Args);
       break;
@@ -6158,11 +6161,6 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
     }
   }
 
-  // Intentionally omitted from the switch above: llvm::Triple::CUDA.  CUDA
-  // compiles always need two toolchains, the CUDA toolchain and the host
-  // toolchain.  So the only valid way to create a CUDA toolchain is via
-  // CreateOffloadingDeviceToolChains.
-
   return *TC;
 }
 

diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index b5e45ae4e0bf..484c8c070264 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -369,18 +369,20 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                     const ArgList &Args,
                                     const char *LinkingOutput) const {
   const auto &TC =
-      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+      static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
 
   StringRef GPUArchName;
-  // If this is an OpenMP action we need to extract the device architecture
-  // from the -march=arch option. This option may come from -Xopenmp-target
-  // flag or the default value.
-  if (JA.isDeviceOffloading(Action::OFK_OpenMP)) {
+  // If this is a CUDA action we need to extract the device architecture
+  // from the Job's associated architecture, otherwise use the -march=arch
+  // option. This option may come from -Xopenmp-target flag or the default
+  // value.
+  if (JA.isDeviceOffloading(Action::OFK_Cuda)) {
+    GPUArchName = JA.getOffloadingArch();
+  } else {
     GPUArchName = Args.getLastArgValue(options::OPT_march_EQ);
     assert(!GPUArchName.empty() && "Must have an architecture passed in.");
-  } else
-    GPUArchName = JA.getOffloadingArch();
+  }
 
   // Obtain architecture from the action.
   CudaArch gpu_arch = StringToCudaArch(GPUArchName);
@@ -442,8 +444,22 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back(Args.MakeArgString(CudaArchToString(gpu_arch)));
   CmdArgs.push_back("--output-file");
   const char *OutputFileName = Args.MakeArgString(TC.getInputFilename(Output));
-  if (std::string(OutputFileName) != std::string(Output.getFilename()))
+
+  // If we are invoking `nvlink` internally we need to output a `.cubin` file.
+  // Checking if the output is a temporary is the cleanest way to determine
+  // this. Putting this logic in `getInputFilename` isn't an option because it
+  // relies on the compilation.
+  // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
+  if (Output.isFilename() &&
+      llvm::find(C.getTempFiles(), Output.getFilename()) !=
+          C.getTempFiles().end()) {
+    SmallString<256> Filename(Output.getFilename());
+    llvm::sys::path::replace_extension(Filename, "cubin");
+    OutputFileName = Args.MakeArgString(Filename);
+  }
+  if (Output.isFilename() && OutputFileName != Output.getFilename())
     C.addTempFile(OutputFileName);
+
   CmdArgs.push_back(OutputFileName);
   for (const auto &II : Inputs)
     CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
@@ -451,15 +467,19 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString(A));
 
-  bool Relocatable = false;
+  bool Relocatable;
   if (JA.isOffloading(Action::OFK_OpenMP))
     // In OpenMP we need to generate relocatable code.
     Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
                                options::OPT_fnoopenmp_relocatable_target,
                                /*Default=*/true);
   else if (JA.isOffloading(Action::OFK_Cuda))
+    // In CUDA we generate relocatable code by default.
     Relocatable = Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
                                /*Default=*/false);
+  else
+    // Otherwise, we are compiling directly and should create linkable output.
+    Relocatable = true;
 
   if (Relocatable)
     CmdArgs.push_back("-c");
@@ -495,11 +515,11 @@ static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
 // All inputs to this linker must be from CudaDeviceActions, as we need to look
 // at the Inputs' Actions in order to figure out which GPU architecture they
 // correspond to.
-void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
-                                 const InputInfo &Output,
-                                 const InputInfoList &Inputs,
-                                 const ArgList &Args,
-                                 const char *LinkingOutput) const {
+void NVPTX::FatBinary::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
   const auto &TC =
       static_cast<const toolchains::CudaToolChain &>(getToolChain());
   assert(TC.getTriple().isNVPTX() && "Wrong platform");
@@ -546,6 +566,93 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       Exec, CmdArgs, Inputs, Output));
 }
 
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::NVPTXToolChain &>(getToolChain());
+  assert(TC.getTriple().isNVPTX() && "Wrong platform");
+
+  ArgStringList CmdArgs;
+  if (Output.isFilename()) {
+    CmdArgs.push_back("-o");
+    CmdArgs.push_back(Output.getFilename());
+  } else {
+    assert(Output.isNothing() && "Invalid output.");
+  }
+
+  if (mustEmitDebugInfo(Args) == EmitSameDebugInfoAsHost)
+    CmdArgs.push_back("-g");
+
+  if (Args.hasArg(options::OPT_v))
+    CmdArgs.push_back("-v");
+
+  StringRef GPUArch = Args.getLastArgValue(options::OPT_march_EQ);
+  assert(!GPUArch.empty() && "At least one GPU Arch required for nvlink.");
+
+  CmdArgs.push_back("-arch");
+  CmdArgs.push_back(Args.MakeArgString(GPUArch));
+
+  // Add paths specified in LIBRARY_PATH environment variable as -L options.
+  addDirectoryList(Args, CmdArgs, "-L", "LIBRARY_PATH");
+
+  // Add paths for the default clang library path.
+  SmallString<256> DefaultLibPath =
+      llvm::sys::path::parent_path(TC.getDriver().Dir);
+  llvm::sys::path::append(DefaultLibPath, CLANG_INSTALL_LIBDIR_BASENAME);
+  CmdArgs.push_back(Args.MakeArgString(Twine("-L") + DefaultLibPath));
+
+  for (const auto &II : Inputs) {
+    if (II.getType() == types::TY_LLVM_IR || II.getType() == types::TY_LTO_IR ||
+        II.getType() == types::TY_LTO_BC || II.getType() == types::TY_LLVM_BC) {
+      C.getDriver().Diag(diag::err_drv_no_linker_llvm_support)
+          << getToolChain().getTripleString();
+      continue;
+    }
+
+    // Currently, we only pass the input files to the linker, we do not pass
+    // any libraries that may be valid only for the host.
+    if (!II.isFilename())
+      continue;
+
+    // The 'nvlink' application performs RDC-mode linking when given a '.o'
+    // file and device linking when given a '.cubin' file. We always want to
+    // perform device linking, so just rename any '.o' files.
+    // FIXME: This should hopefully be removed if NVIDIA updates their tooling.
+    auto InputFile = getToolChain().getInputFilename(II);
+    if (llvm::sys::path::extension(InputFile) != ".cubin") {
+      // If there are no actions above this one then this is direct input and we
+      // can copy it. Otherwise the input is internal so a `.cubin` file should
+      // exist.
+      if (II.getAction() && II.getAction()->getInputs().size() == 0) {
+        const char *CubinF =
+            Args.MakeArgString(getToolChain().getDriver().GetTemporaryPath(
+                llvm::sys::path::stem(InputFile), "cubin"));
+        if (std::error_code EC =
+                llvm::sys::fs::copy_file(InputFile, C.addTempFile(CubinF)))
+          continue;
+
+        CmdArgs.push_back(CubinF);
+      } else {
+        SmallString<256> Filename(InputFile);
+        llvm::sys::path::replace_extension(Filename, "cubin");
+        CmdArgs.push_back(Args.MakeArgString(Filename));
+      }
+    } else {
+      CmdArgs.push_back(Args.MakeArgString(InputFile));
+    }
+  }
+
+  C.addCommand(std::make_unique<Command>(
+      JA, *this,
+      ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8,
+                          "--options-file"},
+      Args.MakeArgString(getToolChain().GetProgramPath("nvlink")), CmdArgs,
+      Inputs, Output));
+}
+
 void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
                                    const llvm::opt::ArgList &Args,
                                    std::vector<StringRef> &Features) {
@@ -588,14 +695,13 @@ void NVPTX::getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   Features.push_back(PtxFeature);
 }
 
-/// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
-/// which isn't properly a linker but nonetheless performs the step of stitching
-/// together object files from the assembler into a single blob.
-
-CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
-                             const ToolChain &HostTC, const ArgList &Args)
-    : ToolChain(D, Triple, Args), HostTC(HostTC),
-      CudaInstallation(D, HostTC.getTriple(), Args) {
+/// NVPTX toolchain. Our assembler is ptxas, and our linker is nvlink. This
+/// operates as a stand-alone version of the NVPTX tools without the host
+/// toolchain.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                               const llvm::Triple &HostTriple,
+                               const ArgList &Args)
+    : ToolChain(D, Triple, Args), CudaInstallation(D, HostTriple, Args) {
   if (CudaInstallation.isValid()) {
     CudaInstallation.WarnIfUnsupportedVersion();
     getProgramPaths().push_back(std::string(CudaInstallation.getBinPath()));
@@ -605,19 +711,70 @@ CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
   getProgramPaths().push_back(getDriver().Dir);
 }
 
-std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
-  // Only object files are changed, for example assembly files keep their .s
-  // extensions. If the user requested device-only compilation don't change it.
-  if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
-    return ToolChain::getInputFilename(Input);
+/// We only need the host triple to locate the CUDA binary utilities, use the
+/// system's default triple if not provided.
+NVPTXToolChain::NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                               const ArgList &Args)
+    : NVPTXToolChain(D, Triple,
+                     llvm::Triple(llvm::sys::getDefaultTargetTriple()), Args) {}
 
-  // Replace extension for object files with cubin because nvlink relies on
-  // these particular file names.
-  SmallString<256> Filename(ToolChain::getInputFilename(Input));
-  llvm::sys::path::replace_extension(Filename, "cubin");
-  return std::string(Filename.str());
+llvm::opt::DerivedArgList *
+NVPTXToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
+                              StringRef BoundArch,
+                              Action::OffloadKind DeviceOffloadKind) const {
+  DerivedArgList *DAL =
+      ToolChain::TranslateArgs(Args, BoundArch, DeviceOffloadKind);
+  if (!DAL)
+    DAL = new DerivedArgList(Args.getBaseArgs());
+
+  const OptTable &Opts = getDriver().getOpts();
+
+  for (Arg *A : Args)
+    if (!llvm::is_contained(*DAL, A))
+      DAL->append(A);
+
+  if (!DAL->hasArg(options::OPT_march_EQ))
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
+                      CudaArchToString(CudaArch::CudaDefault));
+
+  return DAL;
 }
 
+bool NVPTXToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
+  const Option &O = A->getOption();
+  return (O.matches(options::OPT_gN_Group) &&
+          !O.matches(options::OPT_gmodules)) ||
+         O.matches(options::OPT_g_Flag) ||
+         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
+         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
+         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
+         O.matches(options::OPT_gdwarf_5) ||
+         O.matches(options::OPT_gcolumn_info);
+}
+
+void NVPTXToolChain::adjustDebugInfoKind(
+    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
+  switch (mustEmitDebugInfo(Args)) {
+  case DisableDebugInfo:
+    DebugInfoKind = codegenoptions::NoDebugInfo;
+    break;
+  case DebugDirectivesOnly:
+    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
+    break;
+  case EmitSameDebugInfoAsHost:
+    // Use same debug info level as the host.
+    break;
+  }
+}
+
+/// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
+
+CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
+                             const ToolChain &HostTC, const ArgList &Args)
+    : NVPTXToolChain(D, Triple, HostTC.getTriple(), Args), HostTC(HostTC) {}
+
 void CudaToolChain::addClangTargetOptions(
     const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args,
     Action::OffloadKind DeviceOffloadingKind) const {
@@ -696,33 +853,6 @@ llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
   return llvm::DenormalMode::getIEEE();
 }
 
-bool CudaToolChain::supportsDebugInfoOption(const llvm::opt::Arg *A) const {
-  const Option &O = A->getOption();
-  return (O.matches(options::OPT_gN_Group) &&
-          !O.matches(options::OPT_gmodules)) ||
-         O.matches(options::OPT_g_Flag) ||
-         O.matches(options::OPT_ggdbN_Group) || O.matches(options::OPT_ggdb) ||
-         O.matches(options::OPT_gdwarf) || O.matches(options::OPT_gdwarf_2) ||
-         O.matches(options::OPT_gdwarf_3) || O.matches(options::OPT_gdwarf_4) ||
-         O.matches(options::OPT_gdwarf_5) ||
-         O.matches(options::OPT_gcolumn_info);
-}
-
-void CudaToolChain::adjustDebugInfoKind(
-    codegenoptions::DebugInfoKind &DebugInfoKind, const ArgList &Args) const {
-  switch (mustEmitDebugInfo(Args)) {
-  case DisableDebugInfo:
-    DebugInfoKind = codegenoptions::NoDebugInfo;
-    break;
-  case DebugDirectivesOnly:
-    DebugInfoKind = codegenoptions::DebugDirectivesOnly;
-    break;
-  case EmitSameDebugInfoAsHost:
-    // Use same debug info level as the host.
-    break;
-  }
-}
-
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
                                        ArgStringList &CC1Args) const {
   // Check our CUDA version if we're going to include the CUDA headers.
@@ -735,6 +865,19 @@ void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
   CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args);
 }
 
+std::string CudaToolChain::getInputFilename(const InputInfo &Input) const {
+  // Only object files are changed, for example assembly files keep their .s
+  // extensions. If the user requested device-only compilation don't change it.
+  if (Input.getType() != types::TY_Object || getDriver().offloadDeviceOnly())
+    return ToolChain::getInputFilename(Input);
+
+  // Replace extension for object files with cubin because nvlink relies on
+  // these particular file names.
+  SmallString<256> Filename(ToolChain::getInputFilename(Input));
+  llvm::sys::path::replace_extension(Filename, "cubin");
+  return std::string(Filename.str());
+}
+
 llvm::opt::DerivedArgList *
 CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
                              StringRef BoundArch,
@@ -811,12 +954,20 @@ CudaToolChain::getSystemGPUArchs(const ArgList &Args) const {
   return std::move(GPUArchs);
 }
 
+Tool *NVPTXToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *NVPTXToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
 Tool *CudaToolChain::buildAssembler() const {
   return new tools::NVPTX::Assembler(*this);
 }
 
 Tool *CudaToolChain::buildLinker() const {
-  return new tools::NVPTX::Linker(*this);
+  return new tools::NVPTX::FatBinary(*this);
 }
 
 void CudaToolChain::addClangWarningOptions(ArgStringList &CC1Args) const {

diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
index e34a6dd43c5d..bf2f369d405c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -95,6 +95,19 @@ class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
 
 // Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
 // assembly into a single output file.
+class LLVM_LIBRARY_VISIBILITY FatBinary : public Tool {
+public:
+  FatBinary(const ToolChain &TC) : Tool("NVPTX::Linker", "fatbinary", TC) {}
+
+  bool hasIntegratedCPP() const override { return false; }
+
+  void ConstructJob(Compilation &C, const JobAction &JA,
+                    const InputInfo &Output, const InputInfoList &Inputs,
+                    const llvm::opt::ArgList &TCArgs,
+                    const char *LinkingOutput) const override;
+};
+
+// Runs nvlink, which links GPU object files ("cubin" files) into a single file.
 class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
 public:
   Linker(const ToolChain &TC) : Tool("NVPTX::Linker", "fatbinary", TC) {}
@@ -116,7 +129,48 @@ void getNVPTXTargetFeatures(const Driver &D, const llvm::Triple &Triple,
 
 namespace toolchains {
 
-class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
+class LLVM_LIBRARY_VISIBILITY NVPTXToolChain : public ToolChain {
+public:
+  NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                 const llvm::Triple &HostTriple,
+                 const llvm::opt::ArgList &Args);
+
+  NVPTXToolChain(const Driver &D, const llvm::Triple &Triple,
+                 const llvm::opt::ArgList &Args);
+
+  llvm::opt::DerivedArgList *
+  TranslateArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch,
+                Action::OffloadKind DeviceOffloadKind) const override;
+
+  // Never try to use the integrated assembler with CUDA; always fork out to
+  // ptxas.
+  bool useIntegratedAs() const override { return false; }
+  bool isCrossCompiling() const override { return true; }
+  bool isPICDefault() const override { return false; }
+  bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
+    return false;
+  }
+  bool isPICDefaultForced() const override { return false; }
+  bool SupportsProfiling() const override { return false; }
+
+  bool IsMathErrnoDefault() const override { return false; }
+
+  bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
+  void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
+                           const llvm::opt::ArgList &Args) const override;
+
+  // NVPTX supports only DWARF2.
+  unsigned GetDefaultDwarfVersion() const override { return 2; }
+  unsigned getMaxDwarfVersion() const override { return 2; }
+
+  CudaInstallationDetector CudaInstallation;
+
+protected:
+  Tool *buildAssembler() const override; // ptxas.
+  Tool *buildLinker() const override;    // nvlink.
+};
+
+class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
 public:
   CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                 const ToolChain &HostTC, const llvm::opt::ArgList &Args);
@@ -139,21 +193,6 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
       const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
       const llvm::fltSemantics *FPType = nullptr) const override;
 
-  // Never try to use the integrated assembler with CUDA; always fork out to
-  // ptxas.
-  bool useIntegratedAs() const override { return false; }
-  bool isCrossCompiling() const override { return true; }
-  bool isPICDefault() const override { return false; }
-  bool isPIEDefault(const llvm::opt::ArgList &Args) const override {
-    return false;
-  }
-  bool isPICDefaultForced() const override { return false; }
-  bool SupportsProfiling() const override { return false; }
-  bool supportsDebugInfoOption(const llvm::opt::Arg *A) const override;
-  void adjustDebugInfoKind(codegenoptions::DebugInfoKind &DebugInfoKind,
-                           const llvm::opt::ArgList &Args) const override;
-  bool IsMathErrnoDefault() const override { return false; }
-
   void AddCudaIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                           llvm::opt::ArgStringList &CC1Args) const override;
 
@@ -174,12 +213,7 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
   computeMSVCVersion(const Driver *D,
                      const llvm::opt::ArgList &Args) const override;
 
-  unsigned GetDefaultDwarfVersion() const override { return 2; }
-  // NVPTX supports only DWARF2.
-  unsigned getMaxDwarfVersion() const override { return 2; }
-
   const ToolChain &HostTC;
-  CudaInstallationDetector CudaInstallation;
 
   /// Uses nvptx-arch tool to get arch of the system GPU. Will return error
   /// if unable to find one.

diff --git a/clang/test/Driver/cuda-cross-compiling.c b/clang/test/Driver/cuda-cross-compiling.c
new file mode 100644
index 000000000000..992fda8deb33
--- /dev/null
+++ b/clang/test/Driver/cuda-cross-compiling.c
@@ -0,0 +1,68 @@
+// Tests the driver when targeting the NVPTX architecture directly without a
+// host toolchain to perform CUDA mappings.
+
+// REQUIRES: nvptx-registered-target
+
+//
+// Test the generated phases when targeting NVPTX.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -ccc-print-phases %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=PHASES %s
+
+//      PHASES: 0: input, "[[INPUT:.+]]", c
+// PHASES-NEXT: 1: preprocessor, {0}, cpp-output
+// PHASES-NEXT: 2: compiler, {1}, ir
+// PHASES-NEXT: 3: backend, {2}, assembler
+// PHASES-NEXT: 4: assembler, {3}, object
+// PHASES-NEXT: 5: linker, {4}, image
+
+//
+// Test the generated bindings when targeting NVPTX.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -ccc-print-bindings %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=BINDINGS %s
+
+//      BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]].s"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]].s"], output: "[[CUBIN:.+]].o"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN]].o"], output: "a.out"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX. 
+// Ensure that the '.o' files are converted to '.cubin' if produced internally.
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=ARGS %s
+
+//      ARGS: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// ARGS-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
+// ARGS-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "[[CUBIN]].cubin"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX. 
+// Ensure that we emit '.o' files if compiled with '-c'
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -c -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=OBJECT %s
+
+//      OBJECT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_61" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// OBJECT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_61" "--output-file" "[[OBJ:.+]].o" "[[PTX]].s" "-c"
+
+//
+// Test the generated arguments to the CUDA binary utils when targeting NVPTX. 
+// Ensure that we copy input '.o' files to '.cubin' files when linking.
+//
+// RUN: touch %t.o
+// RUN: %clang -target nvptx64-nvidia-cuda -march=sm_61 -### %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=LINK %s
+
+// LINK: nvlink{{.*}}"-o" "a.out" "-arch" "sm_61" {{.*}} "{{.*}}.cubin"
+
+//
+// Test the generated arguments default to a value with no architecture. 
+//
+// RUN: %clang -target nvptx64-nvidia-cuda -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=DEFAULT %s
+
+//      DEFAULT: -cc1" "-triple" "nvptx64-nvidia-cuda" "-S" {{.*}} "-target-cpu" "sm_35" "-target-feature" "+ptx{{[0-9]+}}" {{.*}} "-o" "[[PTX:.+]].s"
+// DEFAULT-NEXT: ptxas{{.*}}"-m64" "-O0" "--gpu-name" "sm_35" "--output-file" "[[CUBIN:.+]].cubin" "[[PTX]].s" "-c"
+// DEFAULT-NEXT: nvlink{{.*}}"-o" "a.out" "-arch" "sm_35" {{.*}} "[[CUBIN]].cubin"