r247317 - [CUDA] Postprocess bitcode linked in during device-side CUDA compilation.

Thu Sep 10 11:24:23 PDT 2015

Author: tra
Date: Thu Sep 10 13:24:23 2015
New Revision: 247317

URL: http://llvm.org/viewvc/llvm-project?rev=247317&view=rev
Log:
[CUDA] Postprocess bitcode linked in during device-side CUDA compilation.

Link in and internalize the symbols we need from supplied bitcode library.

Differential Revision: http://reviews.llvm.org/D11664

Added:
    cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll
    cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu
Modified:
    cfe/trunk/include/clang/Basic/LangOptions.def
    cfe/trunk/include/clang/Driver/CC1Options.td
    cfe/trunk/lib/CodeGen/CodeGenAction.cpp
    cfe/trunk/lib/Frontend/CompilerInvocation.cpp

Modified: cfe/trunk/include/clang/Basic/LangOptions.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/LangOptions.def?rev=247317&r1=247316&r2=247317&view=diff
==============================================================================

--- cfe/trunk/include/clang/Basic/LangOptions.def (original)
+++ cfe/trunk/include/clang/Basic/LangOptions.def Thu Sep 10 13:24:23 2015
@@ -166,6 +166,7 @@ LANGOPT(OpenMPUseTLS      , 1, 0, "Use T
 LANGOPT(CUDAIsDevice      , 1, 0, "Compiling for CUDA device")
 LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions")
 LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)")
+LANGOPT(CUDAUsesLibDevice , 1, 0, "Selectively link and internalize bitcode.")
 
 LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
 LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")

Modified: cfe/trunk/include/clang/Driver/CC1Options.td
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/CC1Options.td?rev=247317&r1=247316&r2=247317&view=diff
==============================================================================
--- cfe/trunk/include/clang/Driver/CC1Options.td (original)
+++ cfe/trunk/include/clang/Driver/CC1Options.td Thu Sep 10 13:24:23 2015
@@ -659,6 +659,8 @@ def fcuda_disable_target_call_checks : F
   HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
 def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
   HelpText<"Incorporate CUDA device-side binary into host object file.">;
+def fcuda_uses_libdevice : Flag<["-"], "fcuda-uses-libdevice">,
+  HelpText<"Selectively link and internalize bitcode.">;
 
 } // let Flags = [CC1Option]
 

Modified: cfe/trunk/lib/CodeGen/CodeGenAction.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenAction.cpp?rev=247317&r1=247316&r2=247317&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CodeGenAction.cpp (original)
+++ cfe/trunk/lib/CodeGen/CodeGenAction.cpp Thu Sep 10 13:24:23 2015
@@ -159,7 +159,12 @@ namespace clang {
       if (LinkModule) {
         if (Linker::LinkModules(
                 M, LinkModule.get(),
-                [=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); }))
+                [=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); },
+                (LangOpts.CUDA && LangOpts.CUDAIsDevice &&
+                 LangOpts.CUDAUsesLibDevice)
+                    ? (Linker::Flags::LinkOnlyNeeded |
+                       Linker::Flags::InternalizeLinkedSymbols)
+                    : Linker::Flags::None))
           return;
       }
 

Modified: cfe/trunk/lib/Frontend/CompilerInvocation.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Frontend/CompilerInvocation.cpp?rev=247317&r1=247316&r2=247317&view=diff
==============================================================================
--- cfe/trunk/lib/Frontend/CompilerInvocation.cpp (original)
+++ cfe/trunk/lib/Frontend/CompilerInvocation.cpp Thu Sep 10 13:24:23 2015
@@ -1406,6 +1406,9 @@ static void ParseLangArgs(LangOptions &O
   if (Args.hasArg(OPT_fcuda_is_device))
     Opts.CUDAIsDevice = 1;
 
+  if (Args.hasArg(OPT_fcuda_uses_libdevice))
+    Opts.CUDAUsesLibDevice = 1;
+
   if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device))
     Opts.CUDAAllowHostCallsFromHostDevice = 1;
 

Added: cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll?rev=247317&view=auto
==============================================================================
--- cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll (added)
+++ cfe/trunk/test/CodeGenCUDA/Inputs/device-code.ll Thu Sep 10 13:24:23 2015
@@ -0,0 +1,38 @@
+; Simple bit of IR to mimic CUDA's libdevice. We want to be
+; able to link with it and we need to make sure all __nvvm_reflect
+; calls are eliminated by the time PTX has been produced.
+
+target triple = "nvptx-unknown-cuda"
+
+declare i32 @__nvvm_reflect(i8*)
+
+@"$str" = private addrspace(1) constant [8 x i8] c"USE_MUL\00"
+
+define void @unused_subfunc(float %a) {
+       ret void
+}
+
+define void @used_subfunc(float %a) {
+       ret void
+}
+
+define float @_Z17device_mul_or_addff(float %a, float %b) {
+  %reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(1)* @"$str", i32 0, i32 0) to i8*))
+  %cmp = icmp ne i32 %reflect, 0
+  br i1 %cmp, label %use_mul, label %use_add
+
+use_mul:
+  %ret1 = fmul float %a, %b
+  br label %exit
+
+use_add:
+  %ret2 = fadd float %a, %b
+  br label %exit
+
+exit:
+  %ret = phi float [%ret1, %use_mul], [%ret2, %use_add]
+
+  call void @used_subfunc(float %ret)
+
+  ret float %ret
+}

Added: cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu?rev=247317&view=auto
==============================================================================
--- cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu (added)
+++ cfe/trunk/test/CodeGenCUDA/link-device-bitcode.cu Thu Sep 10 13:24:23 2015
@@ -0,0 +1,56 @@
+// Test for linking with CUDA's libdevice as outlined in
+// http://llvm.org/docs/NVPTXUsage.html#linking-with-libdevice
+//
+// REQUIRES: nvptx-registered-target
+//
+// Prepare bitcode file to link with
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
+// RUN:    %S/Inputs/device-code.ll
+//
+// Make sure function in device-code gets linked in and internalized.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR
+//
+// Make sure function in device-code gets linked but is not internalized
+// without -fcuda-uses-libdevice
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -emit-llvm \
+// RUN:    -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR-NLD
+//
+// Make sure NVVMReflect pass is enabled in NVPTX back-end.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o /dev/null %s \
+// RUN:    -backend-option -debug-pass=Structure 2>&1 \
+// RUN:    | FileCheck %s -check-prefix CHECK-REFLECT
+
+#include "Inputs/cuda.h"
+
+__device__ float device_mul_or_add(float a, float b);
+extern "C" __device__ double __nv_sin(double x);
+extern "C" __device__ double __nv_exp(double x);
+
+// CHECK-IR-LABEL: define void @_Z26should_not_be_internalizedPf(
+// CHECK-PTX-LABEL: .visible .func _Z26should_not_be_internalizedPf(
+__device__ void should_not_be_internalized(float *data) {}
+
+// Make sure kernel call has not been internalized.
+// CHECK-IR-LABEL: define void @_Z6kernelPfS_
+// CHECK-PTX-LABEL: .visible .entry _Z6kernelPfS_(
+__global__ __attribute__((used)) void kernel(float *out, float *in) {
+  *out = device_mul_or_add(in[0], in[1]);
+  *out += __nv_exp(__nv_sin(*out));
+  should_not_be_internalized(out);
+}
+
+// Make sure device_mul_or_add() is present in IR, is internal and
+// calls __nvvm_reflect().
+// CHECK-IR-LABEL: define internal float @_Z17device_mul_or_addff(
+// CHECK-IR-NLD-LABEL: define float @_Z17device_mul_or_addff(
+// CHECK-IR: call i32 @__nvvm_reflect
+// CHECK-IR: ret float
+
+// Verify that NVVMReflect pass is among the passes run by NVPTX back-end.
+// CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1