[llvm-dev] NVPTX Back-end: relocatable device code support for dynamic parallelism
Lorenz Braun via llvm-dev
llvm-dev at lists.llvm.org
Fri Jun 9 04:31:07 PDT 2017
Hi everyone,
CUDA allows some runtime functions to be called from device code as well.
On a multi-GPU system this allows the GPU to determine its device id on
its own via cudaGetDevice().
Unfortunately I cannot get it working when compiling with clang. When
compiling with nvcc, relocatable device code needs to be set to true
(-rdc=true) and the cudadevrt library is needed when linking [0]. I did
not find any such switch to turn on rdc for clang. Just compiling does not
work as ptxas does not find the function cudaGetDevice().
My guess is that this feature is not supported. Does anyone know if
this is the case?
I also tried to find out what nvcc is doing when rdc is turned on, but
had a few problems trying to understand what's going on. I will attach the
verbose output of nvcc. I
have no clue what the binaries cudafe/cudafe++ and cicc are doing, so it's
rather hard to guess what's happening.
There are additional options like -D__CUDACC_RDC__, --device-c and
--compile-only that are not used when rdc is off. All but --device-c can
be used with clang and I can compile my program; however, I can't get it
to run properly. For each runtime call I get an unknown error with code 30.
I have little hope that someone has already figured out how to get rdc
to work with clang, but I will be grateful for any hint. To whom could I
write regarding this problem? Maybe the NVPTX developers can help?
[0]
http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#toolkit-support-for-dynamic-parallelism
--
Lorenz Braun
Research Associate
Institute of Computer Engineering (ZITI)
B6, 26, Building B, Office B2.20
68131 Mannheim
Phone: +49-621-181-2696
lorenz.braun at ziti.uni-heidelberg.de
-------------- next part --------------
#$ _SPACE_=
#$ _CUDART_=cudart
#$ _HERE_=/opt/cuda-8.0/bin
#$ _THERE_=/opt/cuda-8.0/bin
#$ _TARGET_SIZE_=
#$ _TARGET_DIR_=
#$ _TARGET_SIZE_=64
#$ TOP=/opt/cuda-8.0/bin/..
#$ NVVMIR_LIBRARY_DIR=/opt/cuda-8.0/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/opt/cuda-8.0/bin/../lib:/opt/cuda-8.0/lib64:/opt/llvm-4.0/lib
#$ PATH=/opt/cuda-8.0/bin/../open64/bin:/opt/cuda-8.0/bin/../nvvm/bin:/opt/cuda-8.0/bin:/opt/cuda-8.0/bin:/opt/llvm-4.0/bin:/home/lbraun/.opt/local/bin:/home/lbraun/.local/bin:/home/lbraun/.local/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/home/lbraun/bin:/home/lbraun/bin
#$ INCLUDES="-I/opt/cuda-8.0/bin/..//include"
#$ LIBRARIES= "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++11 -D__CUDA_ARCH__=350 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ "-I/opt/cuda-8.0/bin/..//include" -D"__CUDACC_VER__=80061" -D"__CUDACC_VER_BUILD__=61" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -include "cuda_runtime.h" -m64 "../testApps/cuda_id_test.cu" > "/tmp/tmpxft_00007040_00000000-9_cuda_id_test.cpp1.ii"
#$ cudafe --allow_managed --m64 --gnu_version=40805 --c++11 -tused --no_remove_unneeded_entities --device-c --gen_c_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.c" --stub_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.gpu" --nv_arch "compute_35" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00007040_00000000-3_cuda_id_test.module_id" --include_file_name "tmpxft_00007040_00000000-2_cuda_id_test.fatbin.c" "/tmp/tmpxft_00007040_00000000-9_cuda_id_test.cpp1.ii"
#$ gcc -std=c++11 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ "-I/opt/cuda-8.0/bin/..//include" -D"__CUDACC_VER__=80061" -D"__CUDACC_VER_BUILD__=61" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -include "cuda_runtime.h" -m64 "../testApps/cuda_id_test.cu" > "/tmp/tmpxft_00007040_00000000-5_cuda_id_test.cpp4.ii"
#$ cudafe++ --allow_managed --m64 --gnu_version=40805 --c++11 --parse_templates --device-c --gen_c_file_name "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.cpp" --stub_file_name "tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.stub.c" --module_id_file_name "/tmp/tmpxft_00007040_00000000-3_cuda_id_test.module_id" "/tmp/tmpxft_00007040_00000000-5_cuda_id_test.cpp4.ii"
#$ gcc -D__CUDA_ARCH__=350 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_RDC__ -D__CUDANVVM__ -D__CUDA_FTZ=0 -D__CUDA_PREC_DIV=1 -D__CUDA_PREC_SQRT=1 "-I/opt/cuda-8.0/bin/..//include" -m64 "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.gpu" > "/tmp/tmpxft_00007040_00000000-11_cuda_id_test.cpp2.i"
#$ cudafe -w --allow_managed --m64 --gnu_version=40805 --c --device-c --gen_c_file_name "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.c" --stub_file_name "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.stub.c" --gen_device_file_name "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.gpu" --nv_arch "compute_35" --module_id_file_name "/tmp/tmpxft_00007040_00000000-3_cuda_id_test.module_id" --include_file_name "tmpxft_00007040_00000000-2_cuda_id_test.fatbin.c" "/tmp/tmpxft_00007040_00000000-11_cuda_id_test.cpp2.i"
#$ gcc -D__CUDA_ARCH__=350 -E -x c -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDABE__ -D__CUDANVVM__ -D__USE_FAST_MATH__=0 -D__CUDA_FTZ=0 -D__CUDA_PREC_DIV=1 -D__CUDA_PREC_SQRT=1 "-I/opt/cuda-8.0/bin/..//include" -m64 "/tmp/tmpxft_00007040_00000000-12_cuda_id_test.cudafe2.gpu" > "/tmp/tmpxft_00007040_00000000-13_cuda_id_test.cpp3.i"
#$ cicc -arch compute_35 -m64 -ftz=0 -prec_div=1 -prec_sqrt=1 -fmad=1 -nvvmir-library "/opt/cuda-8.0/bin/../nvvm/libdevice/libdevice.compute_35.10.bc" --device-c --orig_src_file_name "../testApps/cuda_id_test.cu" "/tmp/tmpxft_00007040_00000000-13_cuda_id_test.cpp3.i" -o "/tmp/tmpxft_00007040_00000000-6_cuda_id_test.ptx"
#$ ptxas -arch=sm_35 -m64 --compile-only "/tmp/tmpxft_00007040_00000000-6_cuda_id_test.ptx" -o "/tmp/tmpxft_00007040_00000000-14_cuda_id_test.sm_35.cubin"
#$ fatbinary --create="/tmp/tmpxft_00007040_00000000-2_cuda_id_test.fatbin" -64 --cmdline="--compile-only " "--image=profile=sm_35,file=/tmp/tmpxft_00007040_00000000-14_cuda_id_test.sm_35.cubin" "--image=profile=compute_35,file=/tmp/tmpxft_00007040_00000000-6_cuda_id_test.ptx" --embedded-fatbin="/tmp/tmpxft_00007040_00000000-2_cuda_id_test.fatbin.c" --cuda --device-c
#$ rm /tmp/tmpxft_00007040_00000000-2_cuda_id_test.fatbin
#$ gcc -std=c++11 -D__CUDA_ARCH__=350 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__USE_FAST_MATH__=0 -D__CUDA_FTZ=0 -D__CUDA_PREC_DIV=1 -D__CUDA_PREC_SQRT=1 "-I/opt/cuda-8.0/bin/..//include" -m64 "/tmp/tmpxft_00007040_00000000-4_cuda_id_test.cudafe1.cpp" > "/tmp/tmpxft_00007040_00000000-15_cuda_id_test.ii"
#$ gcc -std=c++11 -c -x c++ "-I/opt/cuda-8.0/bin/..//include" -fpreprocessed -m64 -o "/tmp/tmpxft_00007040_00000000-16_cuda_id_test.o" "/tmp/tmpxft_00007040_00000000-15_cuda_id_test.ii"
#$ nvlink --arch=sm_35 --register-link-binaries="/tmp/tmpxft_00007040_00000000-7_id_test_dlink.reg.c" -m64 "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64" -cpu-arch=X86_64 "/tmp/tmpxft_00007040_00000000-16_cuda_id_test.o" -lcudadevrt -o "/tmp/tmpxft_00007040_00000000-17_id_test_dlink.sm_35.cubin"
#$ fatbinary --create="/tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin" -64 --cmdline="--compile-only " -link "--image=profile=sm_35,file=/tmp/tmpxft_00007040_00000000-17_id_test_dlink.sm_35.cubin" --embedded-fatbin="/tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin.c"
#$ rm /tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin
#$ gcc -std=c++11 -c -x c++ -DFATBINFILE="\"/tmp/tmpxft_00007040_00000000-8_id_test_dlink.fatbin.c\"" -DREGISTERLINKBINARYFILE="\"/tmp/tmpxft_00007040_00000000-7_id_test_dlink.reg.c\"" -I. "-I/opt/cuda-8.0/bin/..//include" -D"__CUDACC_VER__=80061" -D"__CUDACC_VER_BUILD__=61" -D"__CUDACC_VER_MINOR__=0" -D"__CUDACC_VER_MAJOR__=8" -m64 -o "/tmp/tmpxft_00007040_00000000-18_id_test_dlink.o" "/opt/cuda-8.0/bin/crt/link.stub"
#$ g++ -m64 -o "id_test" -std=c++11 -Wl,--start-group "/tmp/tmpxft_00007040_00000000-18_id_test_dlink.o" "/tmp/tmpxft_00007040_00000000-16_cuda_id_test.o" "-L/opt/cuda-8.0/bin/..//lib64/stubs" "-L/opt/cuda-8.0/bin/..//lib64" -lcudadevrt -lcudart_static -lrt -lpthread -ldl -Wl,--end-group
More information about the llvm-dev
mailing list