[llvm-dev] NVPTX Back-end: relocatable device code support for dynamic parallelism

Lorenz Braun via llvm-dev llvm-dev at lists.llvm.org
Fri Jun 9 04:31:07 PDT 2017


Hi everyone,

CUDA allows some runtime functions to be called from device code as 
well. On a multi-GPU system this lets the GPU determine its own device 
id via cudaGetDevice().
Unfortunately, I cannot get this working when compiling with clang. When 
compiling with nvcc, relocatable device code needs to be enabled 
(-rdc=true) and the cudadevrt library is needed when linking [0]. I did 
not find any such switches to turn on rdc for clang. Just compiling does 
not work, as ptxas does not find the function cudaGetDevice().

My guess is that this feature is not supported. Does anyone know if 
this is the case?

I also tried to find out what nvcc is doing when rdc is turned on, but 
had a few problems trying to understand what's going on. I will attach 
the verbose output of nvcc. I have no clue what the binaries 
cudafe/cudafe++ and cicc are doing, so it's rather hard to guess what's 
happening.
There are additional options like -D__CUDACC_RDC__, --device-c and 
--compile-only that are not used when rdc is off. All but --device-c can 
be used with clang, and I can compile my program; however, I can't get 
it to run properly. For each runtime call I get an unknown error with 
code 30.

I have little hope that someone has already figured out how to get rdc 
to work with clang, but I would be grateful for any hint. To whom could 
I write regarding this problem? Maybe the NVPTX developers can help?

[0] 
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#toolkit-support-for-dynamic-parallelism

-- 
Lorenz Braun
Research Associate
Institute of Computer Engineering (ZITI)
B6, 26, Building B, Office B2.20
68131 Mannheim

Phone: +49-621-181-2696
lorenz.braun at ziti.uni-heidelberg.de

-------------- next part --------------
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-8.0/bin
#$ _THERE_=/opt/cuda-8.0/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-8.0/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-8.0/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-8.0/bin/../lib:/opt/cuda-8.0/lib64:/opt/llvm-4.0/lib
#$ PATH=/opt/cuda-8.0/bin/../open64/bin:/opt/cuda-8.0/bin/../nvvm/bin:/opt/cuda-8.0/bin:/opt/cuda-8.0/bin:/opt/llvm-4.0/bin:/home/lbraun/.opt/local/bin:/home/lbraun/.local/bin:/home/lbraun/.local/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/lbraun/bin:/home/lbraun/bin
#$ INCLUDES="-I/opt/cuda-8.0/bin/..//include"
#$ LIBRARIES=  "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++11 -D__CUDA_ARCH__=350 -E -x c++        -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-8.0/bin/..//include"   -D"__CUDACC_VER__=80061" -D"__CUDACC_VER_BUILD__=61" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -include "cuda_runtime.h" -m64 "../testApps/cuda_id_test.cu" > "/tmp/tmpxft_00007040_00000000-9_cuda_id_test.cpp1.ii"

#$ cudafe --allow_managed --m64 --gnu_version=40805 --c++11 -tused --no_remove_unneeded_entities  --device-c --gen_c_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.c" --stub_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00007040_00000000-3_cuda_id_test.module_id" --include_file_name "tmpxft_00007040_00000000-2_cuda_id_test.fatbin.c" "/tmp/tmpxft_00007040_00000000-9_cuda_id_test.cpp1.ii"

#$ gcc -std=c++11 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__  "-I/opt/cuda-8.0/bin/..//include"   -D"__CUDACC_VER__=80061" -D"__CUDACC_VER_BUILD__=61" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -include "cuda_runtime.h" -m64 "../testApps/cuda_id_test.cu" > "/tmp/tmpxft_00007040_00000000-5_cuda_id_test.cpp4.ii"

#$ cudafe++ --allow_managed --m64 --gnu_version=40805 --c++11 --parse_templates  --device-c --gen_c_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.cpp" --stub_file_name "tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_00007040_00000000-3_cuda_id_test.module_id" "/tmp/tmpxft_00007040_00000000-5_cuda_id_test.cpp4.ii"

#$ gcc -D__CUDA_ARCH__=350 -E -x c        -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ -D__CUDANVVM__  -D__CUDA_FTZ=0 -D__CUDA_PREC_DIV=1 -D__CUDA_PREC_SQRT=1 "-I/opt/cuda-8.0/bin/..//include"   -m64 "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.gpu" > "/tmp/tmpxft_00007040_00000000-11_cuda_id_test.cpp2.i"

#$ cudafe -w --allow_managed --m64 --gnu_version=40805 --c  --device-c --gen_c_file_name "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.c" --stub_file_name "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.gpu" --nv_arch "compute_35" --module_id_file_name "/tmp/tmpxft_00007040_00000000-3_cuda_id_test.module_id" --include_file_name "tmpxft_00007040_00000000-2_cuda_id_test.fatbin.c" "/tmp/tmpxft_00007040_00000000-11_cuda_id_test.cpp2.i"

#$ gcc -D__CUDA_ARCH__=350 -E -x c        -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__CUDABE__ -D__CUDANVVM__ -D__USE_FAST_MATH__=0  -D__CUDA_FTZ=0 -D__CUDA_PREC_DIV=1 -D__CUDA_PREC_SQRT=1 "-I/opt/cuda-8.0/bin/..//include"   -m64 "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.gpu" > "/tmp/tmpxft_00007040_00000000-13_cuda_id_test.cpp3.i"

#$ cicc  -arch compute_35 -m64 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -nvvmir-library "/opt/cuda-8.0/bin/../nvvm/libdevice/libdevice.compute_35.10.bc"  --device-c --orig_src_file_name "../testApps/cuda_id_test.cu"  "/tmp/tmpxft_00007040_00000000-13_cuda_id_test.cpp3.i" -o "/tmp/tmpxft_00007040_00000000-6_cuda_id_test.ptx"

#$ ptxas  -arch=sm_35 -m64 --compile-only  "/tmp/tmpxft_00007040_00000000-6_cuda_id_test.ptx"  -o "/tmp/tmpxft_00007040_00000000-14_cuda_id_test.sm_35.cubin"

#$ fatbinary --create="/tmp/tmpxft_00007040_00000000-2_cuda_id_test.fatbin" -64 --cmdline="--compile-only  " "--image=profile=sm_35,file=/tmp/tmpxft_00007040_00000000-14_cuda_id_test.sm_35.cubin" "--image=profile=compute_35,file=/tmp/tmpxft_00007040_00000000-6_cuda_id_test.ptx" --embedded-fatbin="/tmp/tmpxft_00007040_00000000-2_cuda_id_test.fatbin.c" --cuda --device-c
#$ rm /tmp/tmpxft_00007040_00000000-2_cuda_id_test.fatbin

#$ gcc -std=c++11 -D__CUDA_ARCH__=350 -E -x c++        -DCUDA_DOUBLE_MATH_FUNCTIONS  -D__USE_FAST_MATH__=0  -D__CUDA_FTZ=0 -D__CUDA_PREC_DIV=1 -D__CUDA_PREC_SQRT=1 "-I/opt/cuda-8.0/bin/..//include"   -m64 "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.cpp" > "/tmp/tmpxft_00007040_00000000-15_cuda_id_test.ii"
#$ gcc -std=c++11 -c -x c++ "-I/opt/cuda-8.0/bin/..//include"   -fpreprocessed -m64 -o "/tmp/tmpxft_00007040_00000000-16_cuda_id_test.o" "/tmp/tmpxft_00007040_00000000-15_cuda_id_test.ii"
#$ nvlink --arch=sm_35 --register-link-binaries="/tmp/tmpxft_00007040_00000000-7_id_test_dlink.reg.c" -m64   "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64" -cpu-arch=X86_64 "/tmp/tmpxft_00007040_00000000-16_cuda_id_test.o"  -lcudadevrt  -o "/tmp/tmpxft_00007040_00000000-17_id_test_dlink.sm_35.cubin"
#$ fatbinary --create="/tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin" -64 --cmdline="--compile-only  " -link "--image=profile=sm_35,file=/tmp/tmpxft_00007040_00000000-17_id_test_dlink.sm_35.cubin" --embedded-fatbin="/tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin.c"
#$ rm /tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin

#$ gcc -std=c++11 -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_00007040_00000000-7_id_test_dlink.reg.c\"" -I. "-I/opt/cuda-8.0/bin/..//include"   -D"__CUDACC_VER__=80061" -D"__CUDACC_VER_BUILD__=61" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -m64 -o "/tmp/tmpxft_00007040_00000000-18_id_test_dlink.o" "/opt/cuda-8.0/bin/crt/link.stub"
#$ g++ -m64 -o "id_test" -std=c++11 -Wl,--start-group "/tmp/tmpxft_00007040_00000000-18_id_test_dlink.o" "/tmp/tmpxft_00007040_00000000-16_cuda_id_test.o"   "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64" -lcudadevrt  -lcudart_static  -lrt -lpthread  -ldl  -Wl,--end-group


More information about the llvm-dev mailing list