Hello,<br><br>FYI, this is filed as a bug: <a href="http://llvm.org/bugs/show_bug.cgi?id=13324">http://llvm.org/bugs/show_bug.cgi?id=13324</a><br><br>When compiling the following code for sm_20, the byval function parameters are for some reason emitted with .align 0, which is invalid (ptxas requires the alignment to be a power of two). The problem does not occur when compiling for sm_10.<br>
<br>> cat test.ll<br>; ModuleID = '__kernelgen_main_module'<br>target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64"<br>target triple = "ptx64-unknown-unknown"<br><br>%struct.float2 = type { float, float }<br>
<br>define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2* nocapture byval %y) nounwind inlinehint alwaysinline {<br>entry:<br> %y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1<br>
%0 = load float* %y1, align 4<br> %sub = fsub float -0.000000e+00, %0<br> %1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub, float 4.097000e+03, float %0) nounwind<br> %2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0, float 4.097000e+03, float %1) nounwind<br>
%y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1<br> %3 = load float* %y5, align 4<br> %sub7 = fsub float -0.000000e+00, %3<br> %4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub7, float 4.097000e+03, float %3) nounwind<br>
%5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3, float 4.097000e+03, float %4) nounwind<br> %sub12 = fsub float %0, %2<br> %sub14 = fsub float %3, %5<br> %6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind<br>
%sub18 = fsub float -0.000000e+00, %6<br> %7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, float %5, float %sub18) nounwind<br> %8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, float %sub14, float %7) nounwind<br>
%9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5, float %sub12, float %8) nounwind<br> %10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub12, float %sub14, float %9) nounwind<br>
%x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0<br> %11 = load float* %x24, align 4<br> %12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind<br> %x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0<br>
%13 = load float* %x26, align 4<br> %14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind<br> %add = fadd float %12, %14<br> %add29 = fadd float %10, %add<br> %15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind<br>
%sub32 = fsub float %6, %15<br> %16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29) nounwind<br> %agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64 0, i32 0<br> store float %16, float* %agg.result.0, align 8<br>
%agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64 0, i32 1<br> store float %15, float* %agg.result.1, align 4<br> ret void<br>}<br><br>declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone<br>
<br>declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone<br><br>> llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx<br>> cat test.ptx<br>//<br>// Generated by LLVM NVPTX Back-End<br>//<br><br>
.version 3.0<br>.target sm_20, texmode_independent<br>.address_size 64<br><br><br> // .globl __internal_dsmul<br>.func __internal_dsmul(<br> .param .b64 __internal_dsmul_param_0,<br> .param .align 0 .b8 __internal_dsmul_param_1[8],<br>
.param .align 0 .b8 __internal_dsmul_param_2[8]<br>) // @__internal_dsmul<br>{<br> .reg .pred %p<396>;<br> .reg .s16 %rc<396>;<br> .reg .s16 %rs<396>;<br>
.reg .s32 %r<396>;<br> .reg .s64 %rl<396>;<br> .reg .f32 %f<396>;<br> .reg .f64 %fl<396>;<br><br>// BB#0: // %entry<br> mov.b64 %rl0, __internal_dsmul_param_1;<br>
cvta.local.u64 %rl0, %rl0;<br> ld.f32 %f0, [%rl0+4];<br> neg.f32 %f1, %f0;<br> mov.b64 %rl1, __internal_dsmul_param_2;<br> mov.f32 %f2, 0f45800800;<br> // inline asm<br> mad.f32 %f1, %f1, %f2, %f0;<br>
// inline asm<br> // inline asm<br> mad.f32 %f3, %f0, %f2, %f1;<br> // inline asm<br> cvta.local.u64 %rl1, %rl1;<br> ld.f32 %f1, [%rl1+4];<br> neg.f32 %f4, %f1;<br> // inline asm<br> mad.f32 %f4, %f4, %f2, %f1;<br>
// inline asm<br> // inline asm<br> mad.f32 %f4, %f1, %f2, %f4;<br> // inline asm<br> sub.f32 %f5, %f0, %f3;<br> sub.f32 %f6, %f1, %f4;<br> mul.rn.f32 %f2, %f0, %f1;<br> neg.f32 %f7, %f2;<br>
// inline asm<br> mad.f32 %f7, %f3, %f4, %f7;<br> // inline asm<br> // inline asm<br> mad.f32 %f3, %f3, %f6, %f7;<br> // inline asm<br> // inline asm<br> mad.f32 %f3, %f4, %f5, %f3;<br> // inline asm<br>
// inline asm<br> mad.f32 %f3, %f5, %f6, %f3;<br> // inline asm<br> ld.f32 %f4, [%rl1];<br> mul.rn.f32 %f0, %f0, %f4;<br> ld.f32 %f4, [%rl0];<br> mul.rn.f32 %f1, %f4, %f1;<br> add.f32 %f0, %f0, %f1;<br>
add.f32 %f1, %f3, %f0;<br> add.rn.f32 %f0, %f2, %f1;<br> sub.f32 %f2, %f2, %f0;<br> add.rn.f32 %f1, %f2, %f1;<br> ld.param.u64 %rl0, [__internal_dsmul_param_0];<br> st.f32 [%rl0], %f1;<br>
st.f32 [%rl0+4], %f0;<br> ret;<br>}<br><br>> ptxas -arch=sm_20 test.ptx -o ptx.cubin<br>ptxas test.ptx, line 13; error : Alignment must be a power of two<br>ptxas test.ptx, line 14; error : Alignment must be a power of two<br>
ptxas fatal : Ptx assembly aborted due to errors<br><br>