[llvm-dev] instrumenting device code with gpucc

Jingyue Wu via llvm-dev llvm-dev at lists.llvm.org
Fri Mar 4 19:56:02 PST 2016


On Fri, Mar 4, 2016 at 5:50 PM, Yuanfeng Peng <yuanfeng.jack.peng at gmail.com>
wrote:

> Hi Jingyue,
>
> My name is Yuanfeng Peng, I'm a PhD student at UPenn.  I'm sorry to bother
> you, but I'm having trouble with gpucc in my project, and I would be really
> grateful for your help!
>
> Currently we're trying to instrument CUDA code using LLVM 3.9, and I've
> written a pass to insert hook functions for certain function calls and
> memory accesses.  For example, given a CUDA program, say, axpy.cu, I
> first compile it with
>
>  clang++ -emit-llvm -c axpy.cu,
>
> which gives me two bitcode files, axpy.bc and axpy-sm_20.bc.  Then I use
> opt to load my pass and insert the hook functions to axpy.bc, which works
> fine.  After inspecting the instrumented axpy.bc, I noticed that the kernel
> code was not there; rather, it lived inside axpy-sm_20.bc, so I also load
> my pass to instrument axpy-sm_20.bc.
>

Expected. axpy.bc contains host code, and axpy-sm_??.bc contains device
code. If you only want to instrument the device side, you don't need to
modify axpy.bc.


>
>  However, after instrumenting axpy-sm_20.bc, I don't know how could I
> combine the host bitcode & device bitcode into a single binary... When I
> used llc to compile axpy-sm_20.bc into native code, I always got a bunch of
> errors; if I only do llc axpy.bc -o axpy.s and then link axpy.s with the
> necessary libraries, I got a working binary, but only the host code was
> instrumented.
>
> So what should I do to get a binary where the device code is also
> instrumented?
>


To link the modified axpy-sm_20.bc to the final binary, you need several
extra steps:
1. Compile axpy-sm_20.bc to PTX assembly using llc: llc axpy-sm_20.bc -o
axpy-sm_20.ptx -march=<nvptx or nvptx64>
2. Compile the PTX assembly to SASS using ptxas
3. Make the SASS a fat binary using NVIDIA's fatbinary tool
4. Link the fat binary to the host code using ld.

Clang does step 2-4 by invoking subcommands. Therefore, you can use "clang
-###" to dump all the subcommands, and then find the ones for step 2-4. For
example,

$ clang++ -### -O3 axpy.cu -I/usr/local/cuda/samples/common/inc
-L/usr/local/cuda/lib64 -lcudart_static -lcuda -ldl -lrt -pthread
--cuda-gpu-arch=sm_35

clang version 3.9.0 (http://llvm.org/git/clang.git
4ce165e39e7b185e394aa713d9adffd920288988) (http://llvm.org/git/llvm.git
2550ef485b6f9668bb7a4daa7ab276b6501492df)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/google/home/jingyue/Work/llvm/install/bin
 "/usr/local/google/home/jingyue/Work/llvm/install-git/bin/clang-3.7"
"-cc1" "-triple" "nvptx64-nvidia-cuda" "-aux-triple"
"x86_64-unknown-linux-gnu" "-fcuda-target-overloads"
"-fcuda-disable-target-call-checks" "-S" "-disable-free" "-main-file-name" "
axpy.cu" "-mrelocation-model" "static" "-mthread-model" "posix"
"-mdisable-fp-elim" "-fmath-errno" "-no-integrated-as" "-fcuda-is-device"
"-mlink-cuda-bitcode"
"/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc"
"-target-feature" "+ptx42" "-target-cpu" "sm_35" "-dwarf-column-info"
"-debugger-tuning=gdb" "-resource-dir"
"/usr/local/google/home/jingyue/Work/llvm/install-git/bin/../lib/clang/3.9.0"
"-I" "/usr/local/cuda/samples/common/inc" "-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward"
"-internal-isystem" "/usr/local/include" "-internal-isystem"
"/usr/local/google/home/jingyue/Work/llvm/install-git/bin/../lib/clang/3.9.0/include"
"-internal-externc-isystem" "/include" "-internal-externc-isystem"
"/usr/include" "-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward"
"-internal-isystem" "/usr/local/cuda/include" "-include"
"__clang_cuda_runtime_wrapper.h" "-O3" "-fdeprecated-macro"
"-fno-dwarf-directory-asm" "-fdebug-compilation-dir"
"/usr/local/google/home/jingyue/Work/cuda" "-ferror-limit" "19"
"-fmessage-length" "205" "-pthread" "-fobjc-runtime=gcc" "-fcxx-exceptions"
"-fexceptions" "-fdiagnostics-show-option" "-fcolor-diagnostics"
"-vectorize-loops" "-vectorize-slp" "-o" "/tmp/axpy-a88a72.s" "-x" "cuda" "
axpy.cu"
 "/usr/local/cuda/bin/ptxas" "-m64" "-O3" "--gpu-name" "sm_35"
"--output-file" "/tmp/axpy-1dbca7.o" "/tmp/axpy-a88a72.s"
 "/usr/local/cuda/bin/fatbinary" "--cuda" "-64" "--create"
"/tmp/axpy-e6057c.fatbin" "--image=profile=sm_35,file=/tmp/axpy-1dbca7.o"
"--image=profile=compute_35,file=/tmp/axpy-a88a72.s"
 "/usr/local/google/home/jingyue/Work/llvm/install-git/bin/clang-3.7"
"-cc1" "-triple" "x86_64-unknown-linux-gnu" "-aux-triple"
"nvptx64-nvidia-cuda" "-fcuda-target-overloads"
"-fcuda-disable-target-call-checks" "-emit-obj" "-disable-free"
"-main-file-name" "axpy.cu" "-mrelocation-model" "static" "-mthread-model"
"posix" "-fmath-errno" "-masm-verbose" "-mconstructor-aliases"
"-munwind-tables" "-fuse-init-array" "-target-cpu" "x86-64"
"-momit-leaf-frame-pointer" "-dwarf-column-info" "-debugger-tuning=gdb"
"-resource-dir"
"/usr/local/google/home/jingyue/Work/llvm/install-git/bin/../lib/clang/3.9.0"
"-I" "/usr/local/cuda/samples/common/inc" "-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
"-internal-isystem"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward"
"-internal-isystem" "/usr/local/include" "-internal-isystem"
"/usr/local/google/home/jingyue/Work/llvm/install-git/bin/../lib/clang/3.9.0/include"
"-internal-externc-isystem" "/usr/include/x86_64-linux-gnu"
"-internal-externc-isystem" "/include" "-internal-externc-isystem"
"/usr/include" "-internal-isystem" "/usr/local/cuda/include" "-include"
"__clang_cuda_runtime_wrapper.h" "-O3" "-fdeprecated-macro"
"-fdebug-compilation-dir" "/usr/local/google/home/jingyue/Work/cuda"
"-ferror-limit" "19" "-fmessage-length" "205" "-pthread"
"-fobjc-runtime=gcc" "-fcxx-exceptions" "-fexceptions"
"-fdiagnostics-show-option" "-fcolor-diagnostics" "-vectorize-loops"
"-vectorize-slp" "-o" "/tmp/axpy-48f6b5.o" "-x" "cuda" "axpy.cu"
"-fcuda-include-gpubinary" "/tmp/axpy-e6057c.fatbin"
 "/usr/bin/ld" "-z" "relro" "--hash-style=gnu" "--build-id"
"--eh-frame-hdr" "-m" "elf_x86_64" "-dynamic-linker"
"/lib64/ld-linux-x86-64.so.2" "-o" "a.out"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crt1.o"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crti.o"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/crtbegin.o" "-L/usr/local/cuda/lib64"
"-L/usr/lib/gcc/x86_64-linux-gnu/4.8"
"-L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu"
"-L/lib/x86_64-linux-gnu" "-L/lib/../lib64" "-L/usr/lib/x86_64-linux-gnu"
"-L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.."
"-L/usr/local/google/home/jingyue/Work/llvm/install-git/bin/../lib"
"-L/lib" "-L/usr/lib" "/tmp/axpy-48f6b5.o" "-lcudart_static" "-lcuda"
"-ldl" "-lrt" "-lstdc++" "-lm" "-lgcc_s" "-lgcc" "-lpthread" "-lc"
"-lgcc_s" "-lgcc" "/usr/lib/gcc/x86_64-linux-gnu/4.8/crtend.o"
"/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crtn.o"



>
> I apologize for the long email and I look forward to hearing from you.
>
> Thanks!
> Yuanfeng Peng
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20160304/16918652/attachment.html>


More information about the llvm-dev mailing list