[LLVMdev] NVPTX CUDA_ERROR_NO_BINARY_FOR_GPU
Pete Couperus
pjcoup at gmail.com
Fri Mar 1 12:51:52 PST 2013
Hi Timothy,
I'm not sure what you mean by this working for other intrinsics, but
in this case, I think you want to call the intrinsic by its LLVM IR name,
llvm.nvvm.read.ptx.sreg.tid.x, rather than by the TableGen record name
INT_PTX_SREG_TID_X.
For me, this looks like:
%x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
Pete
On Fri, Mar 1, 2013 at 11:51 AM, Timothy Baldridge <tbaldridge at gmail.com> wrote:
> I'm building this with llvm-c, and accessing these intrinsics by calling
> each intrinsic as if it were a function.
>
> class F_SREG<string OpStr, NVPTXRegClass regclassOut, Intrinsic IntOp> :
> NVPTXInst<(outs regclassOut:$dst), (ins),
> OpStr,
> [(set regclassOut:$dst, (IntOp))]>;
>
> def INT_PTX_SREG_TID_X : F_SREG<"mov.u32 \t$dst, %tid.x;", Int32Regs,
> int_nvvm_read_ptx_sreg_tid_x>;
>
> This method of accessing intrinsics works just fine for other intrinsics
> (for instance sqrt). Should I be declaring these as extern global variables?
>
> Thanks,
>
> Timothy
>
>
> On Fri, Mar 1, 2013 at 12:44 PM, Dmitry Mikushin <dmitry at kernelgen.org>
> wrote:
>>
>> Timothy,
>>
>> Those calls to the compute-grid intrinsics are definitely wrong. In PTX code
>> they should end up as reads of special registers, rather than function
>> calls. Try taking a working example and working out the LLVM IR
>> differences between it and the output of your compiler.
>>
>> - D.
>>
>> ----- Original message -----
>> > I've written a compiler that outputs PTX code, the result seems fairly
>> > reasonable, but I'm not sure the intrinsics are getting compiled
>> > correctly.
>> >
>> > In addition, when I try to load the module using CUDA, I get an
>> > error: CUDA_ERROR_NO_BINARY_FOR_GPU. I'm running this on a 2012 MBP
>> > with a 640M GPU.
>> >
>> > PTX Code (for a mandelbrot calculation):
>> >
>> > //
>> > // Generated by LLVM NVPTX Back-End
>> > //
>> >
>> > .version 3.1
>> > .target sm_10, texmode_independent
>> > .address_size 64
>> >
>> > .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_X
>> > (
>> >
>> > )
>> > ;
>> > .func (.reg .b32 func_retval0) INT_PTX_SREG_CTAID_Y
>> > (
>> >
>> > )
>> > ;
>> > .func (.reg .b32 func_retval0) INT_PTX_SREG_TID_X
>> > (
>> >
>> > )
>> > ;
>> > .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_X
>> > (
>> >
>> > )
>> > ;
>> > .func (.reg .b32 func_retval0) INT_PTX_SREG_NTID_Y
>> > (
>> >
>> > )
>> > ;
>> >
>> > // .globl examples_2E_mandelbrot_2F_square
>> > .func (.reg .b64 func_retval0) examples_2E_mandelbrot_2F_square(
>> > .reg .b64 examples_2E_mandelbrot_2F_square_param_0
>> > )
>> > {
>> > .reg .pred %p<396>;
>> > .reg .s16 %rc<396>;
>> > .reg .s16 %rs<396>;
>> > .reg .s32 %r<396>;
>> > .reg .s64 %rl<396>;
>> > .reg .f32 %f<396>;
>> > .reg .f64 %fl<396>;
>> >
>> > mov.f64 %fl0, examples_2E_mandelbrot_2F_square_param_0;
>> > mul.f64 %fl0, %fl0, %fl0;
>> > mov.f64 func_retval0, %fl0;
>> > ret;
>> > }
>> >
>> > // .globl
>> > examples_2E_mandelbrot_2F_calc_2D_iteration
>> > .func (.reg .b64 func_retval0)
>> > examples_2E_mandelbrot_2F_calc_2D_iteration( .reg .b64
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_0, .reg
>> > .b64
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_1, .reg
>> > .b64
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_2, .reg
>> > .b64
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_3, .reg
>> > .b64
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_4 )
>> > {
>> > .reg .pred %p<396>;
>> > .reg .s16 %rc<396>;
>> > .reg .s16 %rs<396>;
>> > .reg .s32 %r<396>;
>> > .reg .s64 %rl<396>;
>> > .reg .f32 %f<396>;
>> > .reg .f64 %fl<396>;
>> >
>> > mov.f64 %fl0,
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_0;
>> > mov.f64
>> > %fl1, examples_2E_mandelbrot_2F_calc_2D_iteration_param_3;
>> > div.rn.f64 %fl0, %fl0, %fl1; mov.f64 %fl2,
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_1;
>> > mul.f64
>> > %fl1, %fl0, 0d400C000000000000; mov.f64 %fl0,
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_2;
>> > add.f64
>> > %fl1, %fl1, 0dC004000000000000; mov.f64 %fl3,
>> > examples_2E_mandelbrot_2F_calc_2D_iteration_param_4;
>> > div.rn.f64
>> > %fl2, %fl2, %fl3; add.f64 %fl2,
>> > %fl2, %fl2;
>> > add.f64 %fl3, %fl2, 0dBFF0000000000000;
>> > mov.f64 %fl2, 0d0000000000000000;
>> > mov.f64 %fl5, %fl2;
>> > mov.f64 %fl4, %fl2;
>> > bra.uni BB1_1;
>> > BB1_2:
>> > add.f64 %fl2, %fl2, 0d3FF0000000000000;
>> > sub.f64 %fl6, %fl6, %fl7;
>> > add.f64 %fl6, %fl6, %fl1;
>> > add.f64 %fl5, %fl5, %fl5;
>> > mul.f64 %fl4, %fl5, %fl4;
>> > add.f64 %fl4, %fl4, %fl3;
>> > mov.f64 %fl5, %fl6;
>> > BB1_1:
>> > mul.f64 %fl6, %fl5, %fl5;
>> > mul.f64 %fl7, %fl4, %fl4;
>> > add.f64 %fl8, %fl6, %fl7;
>> > setp.lt.f64 %p0, %fl8, 0d4010000000000000;
>> > setp.lt.f64 %p1, %fl2, %fl0;
>> > and.pred %p0, %p0, %p1;
>> > @!%p0 bra BB1_3;
>> > bra.uni BB1_2;
>> > BB1_3:
>> > mov.f64 func_retval0, %fl2;
>> > ret;
>> > }
>> >
>> > // .globl
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx .func (.reg .b64
>> > func_retval0) examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx(
>> > .reg .b64
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0,
>> > .reg .b64
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1,
>> > .reg .b64
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2,
>> > .reg .b64
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3
>> > )
>> > {
>> > .reg .pred %p<396>;
>> > .reg .s16 %rc<396>;
>> > .reg .s16 %rs<396>;
>> > .reg .s32 %r<396>;
>> > .reg .s64 %rl<396>;
>> > .reg .f32 %f<396>;
>> > .reg .f64 %fl<396>;
>> >
>> > mov.b64 %rl0,
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_0;
>> > mov.f64 %fl2,
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_1;
>> > mov.f64 %fl1,
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_2;
>> > mov.f64 %fl0,
>> > examples_2E_mandelbrot_2F_calc_2D_mandelbrot_2D_ptx_param_3;
>> > // Callseq Start 0
>> > {
>> > .reg .b32 temp_param_reg;
>> > // <end>}
>> > .reg .b32 retval0;
>> > call.uni (retval0),
>> > INT_PTX_SREG_CTAID_X,
>> > (
>> > );
>> > mov.b32 %r0, retval0;
>> >
>> > //{
>> > }// Callseq End 0
>> > // Callseq Start 1
>> > {
>> > .reg .b32 temp_param_reg;
>> > // <end>}
>> > .reg .b32 retval0;
>> > call.uni (retval0),
>> > INT_PTX_SREG_NTID_X,
>> > (
>> > );
>> > mov.b32 %r1, retval0;
>> >
>> > //{
>> > }// Callseq End 1
>> > // Callseq Start 2
>> > {
>> > .reg .b32 temp_param_reg;
>> > // <end>}
>> > .reg .b32 retval0;
>> > call.uni (retval0),
>> > INT_PTX_SREG_TID_X,
>> > (
>> > );
>> > mov.b32 %r2, retval0;
>> >
>> > //{
>> > }// Callseq End 2
>> > mad.lo.s32 %r0, %r0, %r1, %r2;
>> > cvt.rn.f64.s32 %fl3, %r0;
>> > // Callseq Start 3
>> > {
>> > .reg .b32 temp_param_reg;
>> > // <end>}
>> > .reg .b32 retval0;
>> > call.uni (retval0),
>> > INT_PTX_SREG_CTAID_Y,
>> > (
>> > );
>> > mov.b32 %r0, retval0;
>> >
>> > //{
>> > }// Callseq End 3
>> > // Callseq Start 4
>> > {
>> > .reg .b32 temp_param_reg;
>> > // <end>}
>> > .reg .b32 retval0;
>> > call.uni (retval0),
>> > INT_PTX_SREG_NTID_Y,
>> > (
>> > );
>> > mov.b32 %r1, retval0;
>> >
>> > //{
>> > }// Callseq End 4
>> > // Callseq Start 5
>> > {
>> > .reg .b32 temp_param_reg;
>> > // <end>}
>> > .reg .b32 retval0;
>> > call.uni (retval0),
>> > INT_PTX_SREG_TID_X,
>> > (
>> > );
>> > mov.b32 %r2, retval0;
>> >
>> > //{
>> > }// Callseq End 5
>> > mad.lo.s32 %r0, %r0, %r1, %r2;
>> > cvt.rn.f64.s32 %fl4, %r0;
>> > mul.f64 %fl5, %fl4, %fl2;
>> > add.f64 %fl5, %fl5, %fl3;
>> > cvt.rzi.s64.f64 %rl1, %fl5;
>> > shl.b64 %rl1, %rl1, 3;
>> > add.s64 %rl1, %rl0, %rl1;
>> > div.rn.f64 %fl2, %fl3, %fl2;
>> > mul.f64 %fl2, %fl2, 0d400C000000000000;
>> > add.f64 %fl2, %fl2, 0dC004000000000000;
>> > div.rn.f64 %fl1, %fl4, %fl1;
>> > add.f64 %fl1, %fl1, %fl1;
>> > add.f64 %fl3, %fl1, 0dBFF0000000000000;
>> > mov.f64 %fl1, 0d0000000000000000;
>> > mov.f64 %fl5, %fl1;
>> > mov.f64 %fl4, %fl1;
>> > bra.uni BB2_1;
>> > BB2_2:
>> > add.f64 %fl1, %fl1, 0d3FF0000000000000;
>> > sub.f64 %fl6, %fl6, %fl7;
>> > add.f64 %fl6, %fl6, %fl2;
>> > add.f64 %fl5, %fl5, %fl5;
>> > mul.f64 %fl4, %fl5, %fl4;
>> > add.f64 %fl4, %fl4, %fl3;
>> > mov.f64 %fl5, %fl6;
>> > BB2_1:
>> > mul.f64 %fl6, %fl5, %fl5;
>> > mul.f64 %fl7, %fl4, %fl4;
>> > add.f64 %fl8, %fl6, %fl7;
>> > setp.lt.f64 %p0, %fl8, 0d4010000000000000;
>> > setp.lt.f64 %p1, %fl1, %fl0;
>> > and.pred %p0, %p0, %p1;
>> > @!%p0 bra BB2_3;
>> > bra.uni BB2_2;
>> > BB2_3:
>> > div.rn.f64 %fl0, %fl1, %fl0;
>> > st.global.f64 [%rl1], %fl0;
>> > mov.b64 func_retval0, %rl0;
>> > ret;
>> > }
>>
>
>
>
> --
> “One of the main causes of the fall of the Roman Empire was that–lacking
> zero–they had no way to indicate successful termination of their C
> programs.”
> (Robert Firth)
>
> _______________________________________________
> LLVM Developers mailing list
> LLVMdev at cs.uiuc.edu http://llvm.cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev
>
More information about the llvm-dev mailing list