[llvm-commits] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params

Fri Nov 9 04:16:32 PST 2012

Dear all,

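I'm attaching a patch that should fix the issue mentioned above. It
simply applies to function parameters the same zero-alignment check that
the same file already uses for global variables, falling back to the
preferred type alignment when no explicit alignment is set: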

  emitPTXAddressSpace(PTy->getAddressSpace(), O);
  if (GVar->getAlignment() == 0)
    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
  else
    O << " .align " << GVar->getAlignment();

Could you please review and commit? Do you think it needs a test case?

Thanks,
- D.

dmikushin at hp2:~/forge/align0> llc -march=nvptx64 -mcpu=sm_20 align0.ll -o -
//
// Generated by LLVM NVPTX Back-End
//

.version 3.1
.target sm_20
.address_size 64

	// .globl	__internal_dsmul
.visible .func __internal_dsmul(
	.param .b64 __internal_dsmul_param_0,
	.param .align 4 .b8 __internal_dsmul_param_1[8],
	.param .align 4 .b8 __internal_dsmul_param_2[8]
)                                       // @__internal_dsmul
{
	.reg .pred %p<396>;
	.reg .s16 %rc<396>;
	.reg .s16 %rs<396>;
	.reg .s32 %r<396>;
	.reg .s64 %rl<396>;
	.reg .f32 %f<396>;
	.reg .f64 %fl<396>;

// BB#0:                                // %entry
	mov.b64	%rl0, __internal_dsmul_param_1;
	cvta.local.u64 	%rl0, %rl0;
	ld.f32 	%f0, [%rl0+4];
	neg.f32 	%f1, %f0;
	mov.b64	%rl1, __internal_dsmul_param_2;
	mov.f32 	%f2, 0f45800800;
	// inline asm
	mad.f32 %f1, %f1, %f2, %f0;
	// inline asm
	// inline asm
	mad.f32 %f3, %f0, %f2, %f1;
	// inline asm
	cvta.local.u64 	%rl1, %rl1;
	ld.f32 	%f1, [%rl1+4];
	neg.f32 	%f4, %f1;
	// inline asm
	mad.f32 %f4, %f4, %f2, %f1;
	// inline asm
	// inline asm
	mad.f32 %f4, %f1, %f2, %f4;
	// inline asm
	sub.f32 	%f5, %f0, %f3;
	sub.f32 	%f6, %f1, %f4;
	mul.rn.f32 	%f2, %f0, %f1;
	neg.f32 	%f7, %f2;
	// inline asm
	mad.f32 %f7, %f3, %f4, %f7;
	// inline asm
	// inline asm
	mad.f32 %f3, %f3, %f6, %f7;
	// inline asm
	// inline asm
	mad.f32 %f3, %f4, %f5, %f3;
	// inline asm
	// inline asm
	mad.f32 %f3, %f5, %f6, %f3;
	// inline asm
	ld.f32 	%f4, [%rl1];
	mul.rn.f32 	%f0, %f0, %f4;
	ld.f32 	%f4, [%rl0];
	mul.rn.f32 	%f1, %f4, %f1;
	add.f32 	%f0, %f0, %f1;
	add.f32 	%f1, %f3, %f0;
	add.rn.f32 	%f0, %f2, %f1;
	sub.f32 	%f2, %f2, %f0;
	add.rn.f32 	%f1, %f2, %f1;
	ld.param.u64 	%rl0, [__internal_dsmul_param_0];
	st.f32 	[%rl0], %f1;
	st.f32 	[%rl0+4], %f0;
	ret;
}

2012/7/11 Dmitry N. Mikushin <maemarcus at gmail.com>:
> Hello,
>
> FYI, this is a bug http://llvm.org/bugs/show_bug.cgi?id=13324
>
> When compiling the following code for sm_20, function parameters are for
> some reason emitted with .align 0, which is invalid. The problem does not
> occur when compiling for sm_10.
>
>> cat test.ll
> ; ModuleID = '__kernelgen_main_module'
> target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64"
> target triple = "ptx64-unknown-unknown"
>
> %struct.float2 = type { float, float }
>
> define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture
> sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2*
> nocapture byval %y) nounwind inlinehint alwaysinline {
> entry:
>   %y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1
>   %0 = load float* %y1, align 4
>   %sub = fsub float -0.000000e+00, %0
>   %1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub,
> float 4.097000e+03, float %0) nounwind
>   %2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0,
> float 4.097000e+03, float %1) nounwind
>   %y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1
>   %3 = load float* %y5, align 4
>   %sub7 = fsub float -0.000000e+00, %3
>   %4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
> %sub7, float 4.097000e+03, float %3) nounwind
>   %5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3,
> float 4.097000e+03, float %4) nounwind
>   %sub12 = fsub float %0, %2
>   %sub14 = fsub float %3, %5
>   %6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind
>   %sub18 = fsub float -0.000000e+00, %6
>   %7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2,
> float %5, float %sub18) nounwind
>   %8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2,
> float %sub14, float %7) nounwind
>   %9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5,
> float %sub12, float %8) nounwind
>   %10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
> %sub12, float %sub14, float %9) nounwind
>   %x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0
>   %11 = load float* %x24, align 4
>   %12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind
>   %x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0
>   %13 = load float* %x26, align 4
>   %14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind
>   %add = fadd float %12, %14
>   %add29 = fadd float %10, %add
>   %15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind
>   %sub32 = fsub float %6, %15
>   %16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29)
> nounwind
>   %agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64 0,
> i32 0
>   store float %16, float* %agg.result.0, align 8
>   %agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64 0,
> i32 1
>   store float %15, float* %agg.result.1, align 4
>   ret void
> }
>
> declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone
>
> declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone
>
>> llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx
>> cat test.ptx
> //
> // Generated by LLVM NVPTX Back-End
> //
>
> .version 3.0
> .target sm_20, texmode_independent
> .address_size 64
>
>
>     // .globl    __internal_dsmul
> .func __internal_dsmul(
>     .param .b64 __internal_dsmul_param_0,
>     .param .align 0 .b8 __internal_dsmul_param_1[8],
>     .param .align 0 .b8 __internal_dsmul_param_2[8]
> )                                       // @__internal_dsmul
> {
>     .reg .pred %p<396>;
>     .reg .s16 %rc<396>;
>     .reg .s16 %rs<396>;
>     .reg .s32 %r<396>;
>     .reg .s64 %rl<396>;
>     .reg .f32 %f<396>;
>     .reg .f64 %fl<396>;
>
> // BB#0:                                // %entry
>     mov.b64    %rl0, __internal_dsmul_param_1;
>     cvta.local.u64     %rl0, %rl0;
>     ld.f32     %f0, [%rl0+4];
>     neg.f32     %f1, %f0;
>     mov.b64    %rl1, __internal_dsmul_param_2;
>     mov.f32     %f2, 0f45800800;
>     // inline asm
>     mad.f32 %f1, %f1, %f2, %f0;
>     // inline asm
>     // inline asm
>     mad.f32 %f3, %f0, %f2, %f1;
>     // inline asm
>     cvta.local.u64     %rl1, %rl1;
>     ld.f32     %f1, [%rl1+4];
>     neg.f32     %f4, %f1;
>     // inline asm
>     mad.f32 %f4, %f4, %f2, %f1;
>     // inline asm
>     // inline asm
>     mad.f32 %f4, %f1, %f2, %f4;
>     // inline asm
>     sub.f32     %f5, %f0, %f3;
>     sub.f32     %f6, %f1, %f4;
>     mul.rn.f32     %f2, %f0, %f1;
>     neg.f32     %f7, %f2;
>     // inline asm
>     mad.f32 %f7, %f3, %f4, %f7;
>     // inline asm
>     // inline asm
>     mad.f32 %f3, %f3, %f6, %f7;
>     // inline asm
>     // inline asm
>     mad.f32 %f3, %f4, %f5, %f3;
>     // inline asm
>     // inline asm
>     mad.f32 %f3, %f5, %f6, %f3;
>     // inline asm
>     ld.f32     %f4, [%rl1];
>     mul.rn.f32     %f0, %f0, %f4;
>     ld.f32     %f4, [%rl0];
>     mul.rn.f32     %f1, %f4, %f1;
>     add.f32     %f0, %f0, %f1;
>     add.f32     %f1, %f3, %f0;
>     add.rn.f32     %f0, %f2, %f1;
>     sub.f32     %f2, %f2, %f0;
>     add.rn.f32     %f1, %f2, %f1;
>     ld.param.u64     %rl0, [__internal_dsmul_param_0];
>     st.f32     [%rl0], %f1;
>     st.f32     [%rl0+4], %f0;
>     ret;
> }
>
>> ptxas -arch=sm_20 test.ptx -o ptx.cubin
> ptxas test.ptx, line 13; error   : Alignment must be a power of two
> ptxas test.ptx, line 14; error   : Alignment must be a power of two
> ptxas fatal   : Ptx assembly aborted due to errors
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: align0.patch
Type: application/octet-stream
Size: 517 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20121109/d1c3438d/attachment.obj>