r257809 - [CUDA] Invoke ptxas and fatbinary during compilation.

Thu Jan 14 13:41:28 PST 2016

Author: jlebar
Date: Thu Jan 14 15:41:27 2016
New Revision: 257809

URL: http://llvm.org/viewvc/llvm-project?rev=257809&view=rev
Log:
[CUDA] Invoke ptxas and fatbinary during compilation.

Summary:
Previously we compiled CUDA device code to PTX assembly and embedded
that asm as text in our host binary.  Now we compile to PTX assembly and
then invoke ptxas to assemble the PTX into a cubin file.  We gather the
ptx and cubin files for each of our --cuda-gpu-archs and combine them
using fatbinary, and then embed that into the host binary.

Adds two new command-line flags, -Xcuda_ptxas and -Xcuda_fatbinary,
which pass args down to the external tools.

Reviewers: tra, echristo

Subscribers: cfe-commits, jhen

Differential Revision: http://reviews.llvm.org/D16082

Added:
    cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/bin/
    cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/bin/.keep
    cfe/trunk/test/Driver/cuda-arch-translation.cu
    cfe/trunk/test/Driver/cuda-external-tools.cu
Modified:
    cfe/trunk/include/clang/Driver/Action.h
    cfe/trunk/include/clang/Driver/Options.td
    cfe/trunk/include/clang/Driver/ToolChain.h
    cfe/trunk/include/clang/Driver/Types.def
    cfe/trunk/lib/CodeGen/CGCUDANV.cpp
    cfe/trunk/lib/Driver/Action.cpp
    cfe/trunk/lib/Driver/Driver.cpp
    cfe/trunk/lib/Driver/ToolChains.cpp
    cfe/trunk/lib/Driver/ToolChains.h
    cfe/trunk/lib/Driver/Tools.cpp
    cfe/trunk/lib/Driver/Tools.h
    cfe/trunk/lib/Driver/Types.cpp
    cfe/trunk/test/Driver/cuda-options.cu

Modified: cfe/trunk/include/clang/Driver/Action.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Action.h?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================

--- cfe/trunk/include/clang/Driver/Action.h (original)
+++ cfe/trunk/include/clang/Driver/Action.h Thu Jan 14 15:41:27 2016
@@ -136,7 +136,8 @@ public:
 
 class CudaDeviceAction : public Action {
   virtual void anchor();
-  /// GPU architecture to bind.  Always of the form /sm_\d+/.
+  /// GPU architecture to bind.  Always of the form /sm_\d+/ or null (when the
+  /// action applies to multiple architectures).
   const char *GpuArchName;
   /// True when action results are not consumed by the host action (e.g when
   /// -fsyntax-only or --cuda-device-only options are used).
@@ -147,7 +148,8 @@ public:
 
   const char *getGpuArchName() const { return GpuArchName; }
 
-  /// Gets the compute_XX that corresponds to getGpuArchName().
+  /// Gets the compute_XX that corresponds to getGpuArchName().  Returns null
+  /// when getGpuArchName() is null.
   const char *getComputeArchName() const;
 
   bool isAtTopLevel() const { return AtTopLevel; }

Modified: cfe/trunk/include/clang/Driver/Options.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Options.td?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/Options.td (original)
+++ cfe/trunk/include/clang/Driver/Options.td Thu Jan 14 15:41:27 2016
@@ -336,6 +336,10 @@ def Xassembler : Separate<["-"], "Xassem
 def Xclang : Separate<["-"], "Xclang">,
   HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
   Flags<[DriverOption, CoreOption]>;
+def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
+  HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
+def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
+  HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
 def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
   HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">;
 def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,

Modified: cfe/trunk/include/clang/Driver/ToolChain.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/ToolChain.h?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/ToolChain.h (original)
+++ cfe/trunk/include/clang/Driver/ToolChain.h Thu Jan 14 15:41:27 2016
@@ -228,7 +228,7 @@ public:
   virtual bool IsIntegratedAssemblerDefault() const { return false; }
 
   /// \brief Check if the toolchain should use the integrated assembler.
-  bool useIntegratedAs() const;
+  virtual bool useIntegratedAs() const;
 
   /// IsMathErrnoDefault - Does this tool chain use -fmath-errno by default.
   virtual bool IsMathErrnoDefault() const { return true; }

Modified: cfe/trunk/include/clang/Driver/Types.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Types.def?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/Types.def (original)
+++ cfe/trunk/include/clang/Driver/Types.def Thu Jan 14 15:41:27 2016
@@ -93,4 +93,5 @@ TYPE("treelang",                 Treelan
 TYPE("image",                    Image,        INVALID,         "out",   "")
 TYPE("dSYM",                     dSYM,         INVALID,         "dSYM",  "A")
 TYPE("dependencies",             Dependencies, INVALID,         "d",     "")
+TYPE("cuda-fatbin",              CUDA_FATBIN,  INVALID,         "fatbin","A")
 TYPE("none",                     Nothing,      INVALID,         nullptr, "u")

Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Thu Jan 14 15:41:27 2016
@@ -259,6 +259,8 @@ llvm::Function *CGNVCUDARuntime::makeMod
         TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
         llvm::ConstantStruct::get(FatbinWrapperTy, Values),
         "__cuda_fatbin_wrapper");
+    // NVIDIA's cuobjdump looks for fatbins in this section.
+    FatbinWrapper->setSection(".nvFatBinSegment");
 
     // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(

Modified: cfe/trunk/lib/Driver/Action.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Action.cpp?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Action.cpp (original)
+++ cfe/trunk/lib/Driver/Action.cpp Thu Jan 14 15:41:27 2016
@@ -75,7 +75,7 @@ CudaDeviceAction::CudaDeviceAction(Actio
                                    bool AtTopLevel)
     : Action(CudaDeviceClass, Input), GpuArchName(ArchName),
       AtTopLevel(AtTopLevel) {
-  assert(IsValidGpuArchName(GpuArchName));
+  assert(!GpuArchName || IsValidGpuArchName(GpuArchName));
 }
 
 const char *CudaDeviceAction::getComputeArchName() const {

Modified: cfe/trunk/lib/Driver/Driver.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Driver.cpp?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Driver.cpp (original)
+++ cfe/trunk/lib/Driver/Driver.cpp Thu Jan 14 15:41:27 2016
@@ -949,8 +949,9 @@ static unsigned PrintActions1(const Comp
     os << '"' << BIA->getArchName() << '"' << ", {"
        << PrintActions1(C, *BIA->begin(), Ids) << "}";
   } else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
-    os << '"' << CDA->getGpuArchName() << '"' << ", {"
-       << PrintActions1(C, *CDA->begin(), Ids) << "}";
+    os << '"'
+       << (CDA->getGpuArchName() ? CDA->getGpuArchName() : "(multiple archs)")
+       << '"' << ", {" << PrintActions1(C, *CDA->begin(), Ids) << "}";
   } else {
     const ActionList *AL;
     if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
@@ -1327,7 +1328,7 @@ static Action *buildCudaActions(Compilat
   // Check whether any of device actions stopped before they could generate PTX.
   bool PartialCompilation =
       llvm::any_of(CudaDeviceActions, [](const Action *a) {
-        return a->getKind() != Action::BackendJobClass;
+        return a->getKind() != Action::AssembleJobClass;
       });
 
   // Figure out what to do with device actions -- pass them as inputs to the
@@ -1356,16 +1357,32 @@ static Action *buildCudaActions(Compilat
     return HostAction;
   }
 
-  // Outputs of device actions during complete CUDA compilation get created
-  // with AtTopLevel=false and become inputs for the host action.
+  // If we're not a partial or device-only compilation, we compile each arch to
+  // ptx and assemble to cubin, then feed the cubin *and* the ptx into a device
+  // "link" action, which uses fatbinary to combine these cubins into one
+  // fatbin.  The fatbin is then an input to the host compilation.
   ActionList DeviceActions;
-  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I)
-    DeviceActions.push_back(
-        C.MakeAction<CudaDeviceAction>(CudaDeviceActions[I], GpuArchList[I],
-                                       /* AtTopLevel */ false));
+  for (unsigned I = 0, E = GpuArchList.size(); I != E; ++I) {
+    Action* AssembleAction = CudaDeviceActions[I];
+    assert(AssembleAction->getType() == types::TY_Object);
+    assert(AssembleAction->getInputs().size() == 1);
+
+    Action* BackendAction = AssembleAction->getInputs()[0];
+    assert(BackendAction->getType() == types::TY_PP_Asm);
+
+    for (const auto& A : {AssembleAction, BackendAction}) {
+      DeviceActions.push_back(C.MakeAction<CudaDeviceAction>(
+          A, GpuArchList[I], /* AtTopLevel */ false));
+    }
+  }
+  auto FatbinAction = C.MakeAction<CudaDeviceAction>(
+      C.MakeAction<LinkJobAction>(DeviceActions, types::TY_CUDA_FATBIN),
+      /* GpuArchName = */ nullptr,
+      /* AtTopLevel = */ false);
   // Return a new host action that incorporates original host action and all
   // device actions.
-  return C.MakeAction<CudaHostAction>(HostAction, DeviceActions);
+  return C.MakeAction<CudaHostAction>(std::move(HostAction),
+                                      ActionList({FatbinAction}));
 }
 
 void Driver::BuildActions(Compilation &C, const ToolChain &TC,
@@ -1600,7 +1617,7 @@ Action *Driver::ConstructPhaseAction(Com
     return C.MakeAction<BackendJobAction>(Input, types::TY_PP_Asm);
   }
   case phases::Assemble:
-    return C.MakeAction<AssembleJobAction>(Input, types::TY_Object);
+    return C.MakeAction<AssembleJobAction>(std::move(Input), types::TY_Object);
   }
 
   llvm_unreachable("invalid phase in ConstructPhaseAction");
@@ -1849,11 +1866,14 @@ InputInfo Driver::BuildJobsForActionNoCa
   if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
     // Initial processing of CudaDeviceAction carries host params.
     // Call BuildJobsForAction() again, now with correct device parameters.
-    assert(CDA->getGpuArchName() && "No GPU name in device action.");
-    return BuildJobsForAction(C, *CDA->begin(), C.getCudaDeviceToolChain(),
-                              CDA->getGpuArchName(), CDA->isAtTopLevel(),
-                              /*MultipleArchs*/ true, LinkingOutput,
-                              CachedResults);
+    InputInfo II = BuildJobsForAction(
+        C, *CDA->begin(), C.getCudaDeviceToolChain(), CDA->getGpuArchName(),
+        CDA->isAtTopLevel(), /*MultipleArchs*/ true, LinkingOutput,
+        CachedResults);
+    // Currently II's Action is *CDA->begin().  Set it to CDA instead, so that
+    // one can retrieve II's GPU arch.
+    II.setAction(A);
+    return II;
   }
 
   const ActionList *Inputs = &A->getInputs();

Modified: cfe/trunk/lib/Driver/ToolChains.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains.cpp?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/ToolChains.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains.cpp Thu Jan 14 15:41:27 2016
@@ -1652,13 +1652,14 @@ void Generic_GCC::CudaInstallationDetect
       continue;
 
     CudaInstallPath = CudaPath;
+    CudaBinPath = CudaPath + "/bin";
     CudaIncludePath = CudaInstallPath + "/include";
     CudaLibDevicePath = CudaInstallPath + "/nvvm/libdevice";
     CudaLibPath =
         CudaInstallPath + (TargetTriple.isArch64Bit() ? "/lib64" : "/lib");
 
     if (!(D.getVFS().exists(CudaIncludePath) &&
-          D.getVFS().exists(CudaLibPath) &&
+          D.getVFS().exists(CudaBinPath) && D.getVFS().exists(CudaLibPath) &&
           D.getVFS().exists(CudaLibDevicePath)))
       continue;
 
@@ -4182,13 +4183,16 @@ Tool *DragonFly::buildLinker() const {
   return new tools::dragonfly::Linker(*this);
 }
 
-/// Stub for CUDA toolchain. At the moment we don't have assembler or
-/// linker and need toolchain mainly to propagate device-side options
-/// to CC1.
+/// CUDA toolchain.  Our assembler is ptxas, and our "linker" is fatbinary,
+/// which isn't properly a linker but nonetheless performs the step of stitching
+/// together object files from the assembler into a single blob.
 
 CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                              const ArgList &Args)
-    : Linux(D, Triple, Args) {}
+    : Linux(D, Triple, Args) {
+  if (CudaInstallation.isValid())
+    getProgramPaths().push_back(CudaInstallation.getBinPath());
+}
 
 void
 CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
@@ -4222,7 +4226,7 @@ CudaToolChain::TranslateArgs(const llvm:
   for (Arg *A : Args) {
     if (A->getOption().matches(options::OPT_Xarch__)) {
       // Skip this argument unless the architecture matches BoundArch
-      if (A->getValue(0) != StringRef(BoundArch))
+      if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
         continue;
 
       unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
@@ -4253,10 +4257,19 @@ CudaToolChain::TranslateArgs(const llvm:
     DAL->append(A);
   }
 
-  DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
+  if (BoundArch)
+    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ), BoundArch);
   return DAL;
 }
 
+Tool *CudaToolChain::buildAssembler() const {
+  return new tools::NVPTX::Assembler(*this);
+}
+
+Tool *CudaToolChain::buildLinker() const {
+  return new tools::NVPTX::Linker(*this);
+}
+
 /// XCore tool chain
 XCoreToolChain::XCoreToolChain(const Driver &D, const llvm::Triple &Triple,
                                const ArgList &Args)

Modified: cfe/trunk/lib/Driver/ToolChains.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains.h?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/ToolChains.h (original)
+++ cfe/trunk/lib/Driver/ToolChains.h Thu Jan 14 15:41:27 2016
@@ -163,6 +163,7 @@ protected:
     bool IsValid;
     const Driver &D;
     std::string CudaInstallPath;
+    std::string CudaBinPath;
     std::string CudaLibPath;
     std::string CudaLibDevicePath;
     std::string CudaIncludePath;
@@ -179,6 +180,8 @@ protected:
 
     /// \brief Get the detected Cuda installation path.
     StringRef getInstallPath() const { return CudaInstallPath; }
+    /// \brief Get the detected path to Cuda's bin directory.
+    StringRef getBinPath() const { return CudaBinPath; }
     /// \brief Get the detected Cuda Include path.
     StringRef getIncludePath() const { return CudaIncludePath; }
     /// \brief Get the detected Cuda library path.
@@ -816,6 +819,14 @@ public:
                 const char *BoundArch) const override;
   void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                              llvm::opt::ArgStringList &CC1Args) const override;
+
+  // Never try to use the integrated assembler with CUDA; always fork out to
+  // ptxas.
+  bool useIntegratedAs() const override { return false; }
+
+protected:
+  Tool *buildAssembler() const override;  // ptxas
+  Tool *buildLinker() const override;     // fatbinary (ok, not really a linker)
 };
 
 class LLVM_LIBRARY_VISIBILITY MipsLLVMToolChain : public Linux {

Modified: cfe/trunk/lib/Driver/Tools.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Tools.cpp?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Tools.cpp (original)
+++ cfe/trunk/lib/Driver/Tools.cpp Thu Jan 14 15:41:27 2016
@@ -10625,3 +10625,81 @@ void PS4cpu::Link::ConstructJob(Compilat
   else
     ConstructGoldLinkJob(*this, C, JA, Output, Inputs, Args, LinkingOutput);
 }
+
+void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
+                                    const InputInfo &Output,
+                                    const InputInfoList &Inputs,
+                                    const ArgList &Args,
+                                    const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  std::vector<std::string> gpu_archs =
+      Args.getAllArgValues(options::OPT_march_EQ);
+  assert(gpu_archs.size() == 1 && "Exactly one GPU Arch required for ptxas.");
+  const std::string& gpu_arch = gpu_archs[0];
+
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
+
+  // Clang's default optimization level is -O0, but ptxas's default is -O3.
+  CmdArgs.push_back(Args.MakeArgString(
+      llvm::Twine("-O") +
+      Args.getLastArgValue(options::OPT_O_Group, "0").data()));
+
+  // Don't bother passing -g to ptxas: It's enabled by default at -O0, and
+  // not supported at other optimization levels.
+
+  CmdArgs.push_back("--gpu-name");
+  CmdArgs.push_back(Args.MakeArgString(gpu_arch));
+  CmdArgs.push_back("--output-file");
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  for (const auto& II : Inputs)
+    CmdArgs.push_back(Args.MakeArgString(II.getFilename()));
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("ptxas"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}
+
+// All inputs to this linker must be from CudaDeviceActions, as we need to look
+// at the Inputs' Actions in order to figure out which GPU architecture they
+// correspond to.
+void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
+                                 const InputInfo &Output,
+                                 const InputInfoList &Inputs,
+                                 const ArgList &Args,
+                                 const char *LinkingOutput) const {
+  const auto &TC =
+      static_cast<const toolchains::CudaToolChain &>(getToolChain());
+  assert(TC.getArch() == llvm::Triple::nvptx ||
+         TC.getArch() == llvm::Triple::nvptx64);
+
+  ArgStringList CmdArgs;
+  CmdArgs.push_back("--cuda");
+  CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back(Args.MakeArgString("--create"));
+  CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+
+  for (const auto& II : Inputs) {
+    auto* A = cast<const CudaDeviceAction>(II.getAction());
+    // We need to pass an Arch of the form "sm_XX" for cubin files and
+    // "compute_XX" for ptx.
+    const char *Arch = (II.getType() == types::TY_PP_Asm)
+                           ? A->getComputeArchName()
+                           : A->getGpuArchName();
+    CmdArgs.push_back(Args.MakeArgString(llvm::Twine("--image=profile=") +
+                                         Arch + ",file=" + II.getFilename()));
+  }
+
+  for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_fatbinary))
+    CmdArgs.push_back(Args.MakeArgString(A));
+
+  const char *Exec = Args.MakeArgString(TC.GetProgramPath("fatbinary"));
+  C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
+}

Modified: cfe/trunk/lib/Driver/Tools.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Tools.h?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Tools.h (original)
+++ cfe/trunk/lib/Driver/Tools.h Thu Jan 14 15:41:27 2016
@@ -903,6 +903,41 @@ public:
 };
 } // end namespace PS4cpu
 
+namespace NVPTX {
+
+// Run ptxas, the NVPTX assembler.
+class LLVM_LIBRARY_VISIBILITY Assembler : public Tool {
+ public:
+   Assembler(const ToolChain &TC)
+       : Tool("NVPTX::Assembler", "ptxas", TC, RF_Full, llvm::sys::WEM_UTF8,
+              "--options-file") {}
+
+   bool hasIntegratedCPP() const override { return false; }
+
+   void ConstructJob(Compilation &C, const JobAction &JA,
+                     const InputInfo &Output, const InputInfoList &Inputs,
+                     const llvm::opt::ArgList &TCArgs,
+                     const char *LinkingOutput) const override;
+};
+
+// Runs fatbinary, which combines GPU object files ("cubin" files) and/or PTX
+// assembly into a single output file.
+class LLVM_LIBRARY_VISIBILITY Linker : public Tool {
+ public:
+   Linker(const ToolChain &TC)
+       : Tool("NVPTX::Linker", "fatbinary", TC, RF_Full, llvm::sys::WEM_UTF8,
+              "--options-file") {}
+
+   bool hasIntegratedCPP() const override { return false; }
+
+   void ConstructJob(Compilation &C, const JobAction &JA,
+                     const InputInfo &Output, const InputInfoList &Inputs,
+                     const llvm::opt::ArgList &TCArgs,
+                     const char *LinkingOutput) const override;
+};
+
+}  // end namespace NVPTX
+
 } // end namespace tools
 } // end namespace driver
 } // end namespace clang

Modified: cfe/trunk/lib/Driver/Types.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/Types.cpp?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/Types.cpp (original)
+++ cfe/trunk/lib/Driver/Types.cpp Thu Jan 14 15:41:27 2016
@@ -232,8 +232,7 @@ void types::getCompilationPhases(ID Id,
         P.push_back(phases::Compile);
         P.push_back(phases::Backend);
       }
-      if (Id != TY_CUDA_DEVICE)
-        P.push_back(phases::Assemble);
+      P.push_back(phases::Assemble);
     }
   }
 

Added: cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/bin/.keep
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/bin/.keep?rev=257809&view=auto
==============================================================================
    (empty)

Added: cfe/trunk/test/Driver/cuda-arch-translation.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-arch-translation.cu?rev=257809&view=auto
==============================================================================
--- cfe/trunk/test/Driver/cuda-arch-translation.cu (added)
+++ cfe/trunk/test/Driver/cuda-arch-translation.cu Thu Jan 14 15:41:27 2016
@@ -0,0 +1,37 @@
+// Tests that "sm_XX" gets correctly converted to "compute_YY" when we invoke
+// fatbinary.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// CHECK:fatbinary
+
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_21 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM21 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM30 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_32 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM32 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_37 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM37 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_50 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM50 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_52 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM52 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_53 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM53 %s
+
+// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20
+// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20
+// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30
+// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32
+// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35
+// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37
+// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50
+// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52
+// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53

Added: cfe/trunk/test/Driver/cuda-external-tools.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-external-tools.cu?rev=257809&view=auto
==============================================================================
--- cfe/trunk/test/Driver/cuda-external-tools.cu (added)
+++ cfe/trunk/test/Driver/cuda-external-tools.cu Thu Jan 14 15:41:27 2016
@@ -0,0 +1,70 @@
+// Tests that ptxas and fatbinary are correctly during CUDA compilation.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Regular compile with -O2.
+// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile without -O.  This should result in us passing -O0 to ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Regular compile targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+
+// 32-bit compile.
+// RUN: %clang -### -target x86_32-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
+// Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Check -Xcuda-ptxas and -Xcuda-fatbinary
+// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
+// RUN:   -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
+// RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
+// RUN:   -check-prefix FATBINARY-EXTRA %s
+
+// Match clang job that produces PTX assembly.
+// CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20: "-target-cpu" "sm_20"
+// SM35: "-target-cpu" "sm_35"
+// SM20: "-o" "[[PTXFILE:[^"]*]]"
+// SM35: "-o" "[[PTXFILE:[^"]*]]"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// CHECK:ptxas
+// ARCH64: "-m64"
+// ARCH32: "-m32"
+// OPT0: "-O0"
+// OPT2: "-O2"
+// SM20: "--gpu-name" "sm_20"
+// SM35: "--gpu-name" "sm_35"
+// SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
+// SM35: "--output-file" "[[CUBINFILE:[^"]*]]"
+// PTXAS-EXTRA: "-foo1"
+// PTXAS-EXTRA-SAME: "-foo2"
+// CHECK-SAME: "[[PTXFILE]]"
+
+// Match the call to fatbinary (which combines all our PTX and SASS into one
+// blob).
+// CHECK:fatbinary
+// CHECK-DAG: "--cuda"
+// ARCH64-DAG: "-64"
+// ARCH32-DAG: "-32"
+// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// SM20-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
+// SM35-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
+// SM20-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
+// SM35-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
+// FATBINARY-EXTRA: "-bar1"
+// FATBINARY-EXTRA-SAME: "-bar2"
+
+// Match the clang job for host compilation.
+// CHECK: "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"

Modified: cfe/trunk/test/Driver/cuda-options.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-options.cu?rev=257809&r1=257808&r2=257809&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-options.cu (original)
+++ cfe/trunk/test/Driver/cuda-options.cu Thu Jan 14 15:41:27 2016
@@ -39,13 +39,6 @@
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
 
-// Verify that with -S we compile host and device sides to assembly and
-// incorporate device code into the host side.
-// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN:    -check-prefix HOST -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix NOLINK %s
-
 // Verify that --cuda-gpu-arch option passes the correct GPU archtecture to
 // device compilation.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
@@ -61,7 +54,7 @@
 // RUN:    -check-prefix DEVICE2 -check-prefix DEVICE-SM35 \
 // RUN:    -check-prefix DEVICE2-SM30 -check-prefix HOST \
 // RUN:    -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix INCLUDES-DEVICE2 -check-prefix NOLINK %s
+// RUN:    -check-prefix NOLINK %s
 
 // Verify that device-side results are passed to the correct tool when
 // -save-temps is used.
@@ -92,10 +85,16 @@
 // DEVICE-NOSAVE-SAME: "-aux-triple" "x86_64--linux-gnu"
 // DEVICE-SAME: "-fcuda-is-device"
 // DEVICE-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
+// DEVICE-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // DEVICE-NOSAVE-SAME: "-x" "cuda"
 // DEVICE-SAVE-SAME: "-x" "ir"
 
+// Match the call to ptxas (which assembles PTX to SASS).
+// DEVICE:ptxas
+// DEVICE-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
+// DEVICE-DAG: "[[PTXFILE]]"
+
 // Match another device-side compilation.
 // DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
@@ -108,6 +107,11 @@
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // NODEVICE-SAME-NOT: "-fcuda-is-device"
 
+// INCLUDES-DEVICE:fatbinary
+// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
@@ -121,8 +125,7 @@
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
-// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
-// INCLUDES-DEVICE2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
 
 // Match external assembler that uses compilation output.
 // HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"