r324878 - [CUDA] Add option to generate relocatable device code
Jonas Hahnfeld via cfe-commits
cfe-commits at lists.llvm.org
Mon Feb 12 02:46:45 PST 2018
Author: hahnfeld
Date: Mon Feb 12 02:46:45 2018
New Revision: 324878
URL: http://llvm.org/viewvc/llvm-project?rev=324878&view=rev
Log:
[CUDA] Add option to generate relocatable device code
As a first step, pass '-c/--compile-only' to ptxas so that it
doesn't complain about references to external function. This
will successfully generate object files, but they won't work
at runtime because the registration routines need to adapted.
Differential Revision: https://reviews.llvm.org/D42921
Modified:
cfe/trunk/include/clang/Basic/LangOptions.def
cfe/trunk/include/clang/Driver/Options.td
cfe/trunk/lib/Driver/ToolChains/Clang.cpp
cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
cfe/trunk/lib/Frontend/CompilerInvocation.cpp
cfe/trunk/test/Driver/cuda-external-tools.cu
Modified: cfe/trunk/include/clang/Basic/LangOptions.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/LangOptions.def?rev=324878&r1=324877&r2=324878&view=diff
==============================================================================
--- cfe/trunk/include/clang/Basic/LangOptions.def (original)
+++ cfe/trunk/include/clang/Basic/LangOptions.def Mon Feb 12 02:46:45 2018
@@ -204,6 +204,7 @@ LANGOPT(CUDAAllowVariadicFunctions, 1, 0
LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__")
LANGOPT(CUDADeviceFlushDenormalsToZero, 1, 0, "flushing denormals to zero")
LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
+LANGOPT(CUDARelocatableDeviceCode, 1, 0, "generate relocatable device code")
LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")
Modified: cfe/trunk/include/clang/Driver/Options.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Options.td?rev=324878&r1=324877&r2=324878&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/Options.td (original)
+++ cfe/trunk/include/clang/Driver/Options.td Mon Feb 12 02:46:45 2018
@@ -566,6 +566,9 @@ def fno_cuda_flush_denormals_to_zero : F
def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
Flags<[CC1Option]>, HelpText<"Use approximate transcendental functions">;
def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
+def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option, HelpHidden]>,
+ HelpText<"Generate relocatable device code, also known as separate compilation mode.">;
+def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">;
def dA : Flag<["-"], "dA">, Group<d_Group>;
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
HelpText<"Print macro definitions in -E mode in addition to normal output">;
Modified: cfe/trunk/lib/Driver/ToolChains/Clang.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Clang.cpp?rev=324878&r1=324877&r2=324878&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/ToolChains/Clang.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains/Clang.cpp Mon Feb 12 02:46:45 2018
@@ -4658,14 +4658,20 @@ void Clang::ConstructJob(Compilation &C,
CmdArgs.push_back(Args.MakeArgString(Flags));
}
- // Host-side cuda compilation receives device-side outputs as Inputs[1...].
- // Include them with -fcuda-include-gpubinary.
- if (IsCuda && Inputs.size() > 1)
- for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
- CmdArgs.push_back("-fcuda-include-gpubinary");
- CmdArgs.push_back(I->getFilename());
+ if (IsCuda) {
+ // Host-side cuda compilation receives device-side outputs as Inputs[1...].
+ // Include them with -fcuda-include-gpubinary.
+ if (Inputs.size() > 1) {
+ for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
+ CmdArgs.push_back("-fcuda-include-gpubinary");
+ CmdArgs.push_back(I->getFilename());
+ }
}
+ if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
+ CmdArgs.push_back("-fcuda-rdc");
+ }
+
// OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path
// to specify the result of the compile phase on the host, so the meaningful
// device declarations can be identified. Also, -fopenmp-is-device is passed
Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=324878&r1=324877&r2=324878&view=diff
==============================================================================
--- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original)
+++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Mon Feb 12 02:46:45 2018
@@ -355,11 +355,17 @@ void NVPTX::Assembler::ConstructJob(Comp
for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
CmdArgs.push_back(Args.MakeArgString(A));
- // In OpenMP we need to generate relocatable code.
- if (JA.isOffloading(Action::OFK_OpenMP) &&
- Args.hasFlag(options::OPT_fopenmp_relocatable_target,
- options::OPT_fnoopenmp_relocatable_target,
- /*Default=*/ true))
+ bool Relocatable = false;
+ if (JA.isOffloading(Action::OFK_OpenMP))
+ // In OpenMP we need to generate relocatable code.
+ Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
+ options::OPT_fnoopenmp_relocatable_target,
+ /*Default=*/true);
+ else if (JA.isOffloading(Action::OFK_Cuda))
+ Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
+ options::OPT_fno_cuda_rdc, /*Default=*/false);
+
+ if (Relocatable)
CmdArgs.push_back("-c");
const char *Exec;
@@ -540,6 +546,10 @@ void CudaToolChain::addClangTargetOption
if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
options::OPT_fno_cuda_approx_transcendentals, false))
CC1Args.push_back("-fcuda-approx-transcendentals");
+
+ if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+ false))
+ CC1Args.push_back("-fcuda-rdc");
}
if (DriverArgs.hasArg(options::OPT_nocudalib))
Modified: cfe/trunk/lib/Frontend/CompilerInvocation.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Frontend/CompilerInvocation.cpp?rev=324878&r1=324877&r2=324878&view=diff
==============================================================================
--- cfe/trunk/lib/Frontend/CompilerInvocation.cpp (original)
+++ cfe/trunk/lib/Frontend/CompilerInvocation.cpp Mon Feb 12 02:46:45 2018
@@ -2074,6 +2074,8 @@ static void ParseLangArgs(LangOptions &O
if (Opts.CUDAIsDevice && Args.hasArg(OPT_fcuda_approx_transcendentals))
Opts.CUDADeviceApproxTranscendentals = 1;
+ Opts.CUDARelocatableDeviceCode = Args.hasArg(OPT_fcuda_rdc);
+
if (Opts.ObjC1) {
if (Arg *arg = Args.getLastArg(OPT_fobjc_runtime_EQ)) {
StringRef value = arg->getValue();
Modified: cfe/trunk/test/Driver/cuda-external-tools.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Driver/cuda-external-tools.cu?rev=324878&r1=324877&r2=324878&view=diff
==============================================================================
--- cfe/trunk/test/Driver/cuda-external-tools.cu (original)
+++ cfe/trunk/test/Driver/cuda-external-tools.cu Mon Feb 12 02:46:45 2018
@@ -18,6 +18,9 @@
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT3 %s
+// Generating relocatable device code
+// RUN: %clang -### -target x86_64-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// With debugging enabled, ptxas should be run with with no ptxas optimizations.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \
@@ -42,14 +45,23 @@
// Regular compile targeting sm_35.
// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35 %s
+// Separate compilation targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
// 32-bit compile.
// RUN: %clang -### -target i386-linux-gnu -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
+// 32-bit compile when generating relocatable device code.
+// RUN: %clang -### -target i386-linux-gnu -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,OPT0 %s
+// Check that we still pass -c when generating relocatable device code.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
// Check -Xcuda-ptxas and -Xcuda-fatbinary
// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
@@ -64,6 +76,14 @@
// RUN: %clang -### -target i386-apple-macosx -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20 %s
+// Check relocatable device code generation on MacOS.
+// RUN: %clang -### -target x86_64-apple-macosx -O0 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM20,RDC %s
+// RUN: %clang -### -target x86_64-apple-macosx --cuda-gpu-arch=sm_35 -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,RDC %s
+// RUN: %clang -### -target i386-apple-macosx -fcuda-rdc -c %s 2>&1 \
+// RUN: | FileCheck -check-prefixes=CHECK,ARCH32,SM20,RDC %s
+
// Check that CLANG forwards the -v flag to PTXAS.
// RUN: %clang -### -save-temps -no-canonical-prefixes -v %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-PTXAS-VERBOSE %s
@@ -76,6 +96,8 @@
// SM35-SAME: "-target-cpu" "sm_35"
// SM20-SAME: "-o" "[[PTXFILE:[^"]*]]"
// SM35-SAME: "-o" "[[PTXFILE:[^"]*]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
// Match the call to ptxas (which assembles PTX to SASS).
// CHECK: ptxas
@@ -97,6 +119,8 @@
// CHECK-SAME: "[[PTXFILE]]"
// PTXAS-EXTRA-SAME: "-foo1"
// PTXAS-EXTRA-SAME: "-foo2"
+// RDC-SAME: "-c"
+// CHECK-NOT: "-c"
// Match the call to fatbinary (which combines all our PTX and SASS into one
// blob).
@@ -117,5 +141,7 @@
// ARCH64-SAME: "-triple" "x86_64-
// ARCH32-SAME: "-triple" "i386-
// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// RDC-SAME: "-fcuda-rdc"
+// CHECK-NOT: "-fcuda-rdc"
// CHK-PTXAS-VERBOSE: ptxas{{.*}}" "-v"
More information about the cfe-commits
mailing list