[LLVMdev] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params
Dmitry N. Mikushin
maemarcus at gmail.com
Fri Nov 9 04:16:32 PST 2012
Dear all,
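I'm attaching a patch that should fix the issue mentioned above. It
simply applies to function parameters the same check that this file
already performs for global variables, i.e. when the alignment is 0 it
falls back to the preferred alignment of the type: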
emitPTXAddressSpace(PTy->getAddressSpace(), O);
if (GVar->getAlignment() == 0)
O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
else
O << " .align " << GVar->getAlignment();
Could you please review and commit? Do you think it needs a test case?
Thanks,
- D.
dmikushin at hp2:~/forge/align0> llc -march=nvptx64 -mcpu=sm_20 align0.ll -o -
//
// Generated by LLVM NVPTX Back-End
//
.version 3.1
.target sm_20
.address_size 64
// .globl __internal_dsmul
.visible .func __internal_dsmul(
.param .b64 __internal_dsmul_param_0,
.param .align 4 .b8 __internal_dsmul_param_1[8],
.param .align 4 .b8 __internal_dsmul_param_2[8]
) // @__internal_dsmul
{
.reg .pred %p<396>;
.reg .s16 %rc<396>;
.reg .s16 %rs<396>;
.reg .s32 %r<396>;
.reg .s64 %rl<396>;
.reg .f32 %f<396>;
.reg .f64 %fl<396>;
// BB#0: // %entry
mov.b64 %rl0, __internal_dsmul_param_1;
cvta.local.u64 %rl0, %rl0;
ld.f32 %f0, [%rl0+4];
neg.f32 %f1, %f0;
mov.b64 %rl1, __internal_dsmul_param_2;
mov.f32 %f2, 0f45800800;
// inline asm
mad.f32 %f1, %f1, %f2, %f0;
// inline asm
// inline asm
mad.f32 %f3, %f0, %f2, %f1;
// inline asm
cvta.local.u64 %rl1, %rl1;
ld.f32 %f1, [%rl1+4];
neg.f32 %f4, %f1;
// inline asm
mad.f32 %f4, %f4, %f2, %f1;
// inline asm
// inline asm
mad.f32 %f4, %f1, %f2, %f4;
// inline asm
sub.f32 %f5, %f0, %f3;
sub.f32 %f6, %f1, %f4;
mul.rn.f32 %f2, %f0, %f1;
neg.f32 %f7, %f2;
// inline asm
mad.f32 %f7, %f3, %f4, %f7;
// inline asm
// inline asm
mad.f32 %f3, %f3, %f6, %f7;
// inline asm
// inline asm
mad.f32 %f3, %f4, %f5, %f3;
// inline asm
// inline asm
mad.f32 %f3, %f5, %f6, %f3;
// inline asm
ld.f32 %f4, [%rl1];
mul.rn.f32 %f0, %f0, %f4;
ld.f32 %f4, [%rl0];
mul.rn.f32 %f1, %f4, %f1;
add.f32 %f0, %f0, %f1;
add.f32 %f1, %f3, %f0;
add.rn.f32 %f0, %f2, %f1;
sub.f32 %f2, %f2, %f0;
add.rn.f32 %f1, %f2, %f1;
ld.param.u64 %rl0, [__internal_dsmul_param_0];
st.f32 [%rl0], %f1;
st.f32 [%rl0+4], %f0;
ret;
}
2012/7/11 Dmitry N. Mikushin <maemarcus at gmail.com>:
> Hello,
>
> FYI, this is bug http://llvm.org/bugs/show_bug.cgi?id=13324
>
> When compiling the following code for sm_20, func params are for some reason
> given with .align 0, which is invalid. The problem does not occur if compiled
> for sm_10.
>
>> cat test.ll
> ; ModuleID = '__kernelgen_main_module'
> target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64"
> target triple = "ptx64-unknown-unknown"
>
> %struct.float2 = type { float, float }
>
> define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture
> sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2*
> nocapture byval %y) nounwind inlinehint alwaysinline {
> entry:
> %y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1
> %0 = load float* %y1, align 4
> %sub = fsub float -0.000000e+00, %0
> %1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub,
> float 4.097000e+03, float %0) nounwind
> %2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0,
> float 4.097000e+03, float %1) nounwind
> %y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1
> %3 = load float* %y5, align 4
> %sub7 = fsub float -0.000000e+00, %3
> %4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
> %sub7, float 4.097000e+03, float %3) nounwind
> %5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3,
> float 4.097000e+03, float %4) nounwind
> %sub12 = fsub float %0, %2
> %sub14 = fsub float %3, %5
> %6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind
> %sub18 = fsub float -0.000000e+00, %6
> %7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2,
> float %5, float %sub18) nounwind
> %8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2,
> float %sub14, float %7) nounwind
> %9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5,
> float %sub12, float %8) nounwind
> %10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
> %sub12, float %sub14, float %9) nounwind
> %x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0
> %11 = load float* %x24, align 4
> %12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind
> %x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0
> %13 = load float* %x26, align 4
> %14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind
> %add = fadd float %12, %14
> %add29 = fadd float %10, %add
> %15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind
> %sub32 = fsub float %6, %15
> %16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29)
> nounwind
> %agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64 0,
> i32 0
> store float %16, float* %agg.result.0, align 8
> %agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64 0,
> i32 1
> store float %15, float* %agg.result.1, align 4
> ret void
> }
>
> declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone
>
> declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone
>
>> llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx
>> cat test.ptx
> //
> // Generated by LLVM NVPTX Back-End
> //
>
> .version 3.0
> .target sm_20, texmode_independent
> .address_size 64
>
>
> // .globl __internal_dsmul
> .func __internal_dsmul(
> .param .b64 __internal_dsmul_param_0,
> .param .align 0 .b8 __internal_dsmul_param_1[8],
> .param .align 0 .b8 __internal_dsmul_param_2[8]
> ) // @__internal_dsmul
> {
> .reg .pred %p<396>;
> .reg .s16 %rc<396>;
> .reg .s16 %rs<396>;
> .reg .s32 %r<396>;
> .reg .s64 %rl<396>;
> .reg .f32 %f<396>;
> .reg .f64 %fl<396>;
>
> // BB#0: // %entry
> mov.b64 %rl0, __internal_dsmul_param_1;
> cvta.local.u64 %rl0, %rl0;
> ld.f32 %f0, [%rl0+4];
> neg.f32 %f1, %f0;
> mov.b64 %rl1, __internal_dsmul_param_2;
> mov.f32 %f2, 0f45800800;
> // inline asm
> mad.f32 %f1, %f1, %f2, %f0;
> // inline asm
> // inline asm
> mad.f32 %f3, %f0, %f2, %f1;
> // inline asm
> cvta.local.u64 %rl1, %rl1;
> ld.f32 %f1, [%rl1+4];
> neg.f32 %f4, %f1;
> // inline asm
> mad.f32 %f4, %f4, %f2, %f1;
> // inline asm
> // inline asm
> mad.f32 %f4, %f1, %f2, %f4;
> // inline asm
> sub.f32 %f5, %f0, %f3;
> sub.f32 %f6, %f1, %f4;
> mul.rn.f32 %f2, %f0, %f1;
> neg.f32 %f7, %f2;
> // inline asm
> mad.f32 %f7, %f3, %f4, %f7;
> // inline asm
> // inline asm
> mad.f32 %f3, %f3, %f6, %f7;
> // inline asm
> // inline asm
> mad.f32 %f3, %f4, %f5, %f3;
> // inline asm
> // inline asm
> mad.f32 %f3, %f5, %f6, %f3;
> // inline asm
> ld.f32 %f4, [%rl1];
> mul.rn.f32 %f0, %f0, %f4;
> ld.f32 %f4, [%rl0];
> mul.rn.f32 %f1, %f4, %f1;
> add.f32 %f0, %f0, %f1;
> add.f32 %f1, %f3, %f0;
> add.rn.f32 %f0, %f2, %f1;
> sub.f32 %f2, %f2, %f0;
> add.rn.f32 %f1, %f2, %f1;
> ld.param.u64 %rl0, [__internal_dsmul_param_0];
> st.f32 [%rl0], %f1;
> st.f32 [%rl0+4], %f0;
> ret;
> }
>
>> ptxas -arch=sm_20 test.ptx -o ptx.cubin
> ptxas test.ptx, line 13; error : Alignment must be a power of two
> ptxas test.ptx, line 14; error : Alignment must be a power of two
> ptxas fatal : Ptx assembly aborted due to errors
>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: align0.patch
Type: application/octet-stream
Size: 517 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20121109/d1c3438d/attachment.obj>
More information about the llvm-dev
mailing list