[LLVMdev] [NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params

Tue Jul 10 18:46:44 PDT 2012

Hello,

FYI, this has been filed as a bug: http://llvm.org/bugs/show_bug.cgi?id=13324

When the following code is compiled for sm_20, the function parameters are,
for some reason, emitted with .align 0, which is invalid. The problem does
not occur when compiling for sm_10.

> cat test.ll
; ModuleID = '__kernelgen_main_module'
; Data layout: little-endian ("e"), 64-bit pointers with 64-bit alignment
; ("p:64:64"), i64 and f64 aligned to 64 bits, native integer widths
; 1/8/16/32/64 ("n1:8:16:32:64").
target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64"
; Note: the triple names "ptx64", while this module is compiled with
; llc -march=nvptx64 in this report.
target triple = "ptx64-unknown-unknown"

; Aggregate of two 32-bit floats; accessed as field 0 and field 1.
%struct.float2 = type { float, float }

; __internal_dsmul: device function that reads the two byval %struct.float2
; arguments %x and %y and writes a two-float result through the sret pointer
; %agg.result.
;
; NOTE(review): the name and the 4097.0 (= 2^12 + 1) multiplier suggest a
; "double-single" multiply that splits field 1 of each operand into high/low
; halves (Dekker-style) -- confirm against the original CUDA source.
;
; The byval parameters %x and %y are the ones emitted with ".align 0" in the
; generated PTX. The define line and several calls are hard-wrapped by the
; mail client; each wrapped group is one IR statement.
define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture
sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2*
nocapture byval %y) nounwind inlinehint alwaysinline {
entry:
  ; %0 = field 1 of %x.
  %y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1
  %0 = load float* %y1, align 4
  ; %1 = (-%0) * 4097 + %0 ; %2 = %0 * 4097 + %1.
  %sub = fsub float -0.000000e+00, %0
  %1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
%sub, float 4.097000e+03, float %0) nounwind
  %2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0,
float 4.097000e+03, float %1) nounwind
  ; %3 = field 1 of %y, then the same two-mad sequence: %5 = %3 * 4097 + %4.
  %y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1
  %3 = load float* %y5, align 4
  %sub7 = fsub float -0.000000e+00, %3
  %4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
%sub7, float 4.097000e+03, float %3) nounwind
  %5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3,
float 4.097000e+03, float %4) nounwind
  ; Remainders of each operand after subtracting the values computed above.
  %sub12 = fsub float %0, %2
  %sub14 = fsub float %3, %5
  ; %6 = %0 * %3 via the mul.rn intrinsic.
  %6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind
  ; Accumulate the four partial products (%2*%5, %2*%sub14, %5*%sub12,
  ; %sub12*%sub14) onto -%6 with a chain of mad.f32; the result is %10.
  %sub18 = fsub float -0.000000e+00, %6
  %7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2,
float %5, float %sub18) nounwind
  %8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2,
float %sub14, float %7) nounwind
  %9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5,
float %sub12, float %8) nounwind
  %10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float
%sub12, float %sub14, float %9) nounwind
  ; Cross terms: %12 = x.field1 * y.field0 and %14 = x.field0 * y.field1.
  %x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0
  %11 = load float* %x24, align 4
  %12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind
  %x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0
  %13 = load float* %x26, align 4
  %14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind
  ; %add29 = %10 + (%12 + %14).
  %add = fadd float %12, %14
  %add29 = fadd float %10, %add
  ; %15 = %6 + %add29 ; %16 = (%6 - %15) + %add29.
  %15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind
  %sub32 = fsub float %6, %15
  %16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29)
nounwind
  ; Store %16 into result field 0 and %15 into result field 1.
  %agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64
0, i32 0
  store float %16, float* %agg.result.0, align 8
  %agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64
0, i32 1
  store float %15, float* %agg.result.1, align 4
  ret void
}

; Declarations of the two NVVM intrinsics called by @__internal_dsmul.
; In the generated PTX shown in this report they appear as add.rn.f32 and
; mul.rn.f32 instructions.
declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone

declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone

> llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx
> cat test.ptx
//
// Generated by LLVM NVPTX Back-End
//

.version 3.0                            // PTX ISA version of this module
.target sm_20, texmode_independent      // matches the -mcpu=sm_20 passed to llc
.address_size 64                        // 64-bit addressing (-march=nvptx64)

    // .globl    __internal_dsmul
.func __internal_dsmul(                 // PTX emitted for @__internal_dsmul from test.ll
    .param .b64 __internal_dsmul_param_0,             // result pointer (sret %agg.result in the IR)
    .param .align 0 .b8 __internal_dsmul_param_1[8],  // byval %x (8 bytes); ".align 0" is the invalid alignment ptxas rejects
    .param .align 0 .b8 __internal_dsmul_param_2[8]   // byval %y (8 bytes); same invalid ".align 0"
)                                       // @__internal_dsmul
{
    // Register declarations: 396 registers are declared in every class,
    // although only %rl0-%rl1 and %f0-%f7 are used in this body.
    .reg .pred %p<396>;
    .reg .s16 %rc<396>;
    .reg .s16 %rs<396>;
    .reg .s32 %r<396>;
    .reg .s64 %rl<396>;
    .reg .f32 %f<396>;
    .reg .f64 %fl<396>;

// BB#0:                                // %entry
    // %rl0 = address of the byval copy of x (converted with cvta.local);
    // %f0 = float at offset 4 (field 1 of x); %f1 = -%f0.
    mov.b64    %rl0, __internal_dsmul_param_1;
    cvta.local.u64     %rl0, %rl0;
    ld.f32     %f0, [%rl0+4];
    neg.f32     %f1, %f0;
    mov.b64    %rl1, __internal_dsmul_param_2;
    // 0f45800800 is the IEEE-754 single-precision encoding of 4097.0
    // (the 4.097000e+03 constant in the IR).
    mov.f32     %f2, 0f45800800;
    // inline asm
    mad.f32 %f1, %f1, %f2, %f0;
    // inline asm
    // inline asm
    mad.f32 %f3, %f0, %f2, %f1;
    // inline asm
    // Same sequence for y: %f1 = field 1 of y, result of the two mads in %f4.
    cvta.local.u64     %rl1, %rl1;
    ld.f32     %f1, [%rl1+4];
    neg.f32     %f4, %f1;
    // inline asm
    mad.f32 %f4, %f4, %f2, %f1;
    // inline asm
    // inline asm
    mad.f32 %f4, %f1, %f2, %f4;
    // inline asm
    // %f5 = %f0 - %f3 and %f6 = %f1 - %f4 (the %sub12 / %sub14 values in the IR).
    sub.f32     %f5, %f0, %f3;
    sub.f32     %f6, %f1, %f4;
    // %f2 is reused: it now holds x.field1 * y.field1; %f7 = -%f2.
    mul.rn.f32     %f2, %f0, %f1;
    neg.f32     %f7, %f2;
    // Chain of four mads accumulating the partial products; final value in %f3.
    // inline asm
    mad.f32 %f7, %f3, %f4, %f7;
    // inline asm
    // inline asm
    mad.f32 %f3, %f3, %f6, %f7;
    // inline asm
    // inline asm
    mad.f32 %f3, %f4, %f5, %f3;
    // inline asm
    // inline asm
    mad.f32 %f3, %f5, %f6, %f3;
    // inline asm
    // Cross terms: x.field1 * y.field0 and x.field0 * y.field1, summed into %f0,
    // then added to the mad-chain result to give %f1.
    ld.f32     %f4, [%rl1];
    mul.rn.f32     %f0, %f0, %f4;
    ld.f32     %f4, [%rl0];
    mul.rn.f32     %f1, %f4, %f1;
    add.f32     %f0, %f0, %f1;
    add.f32     %f1, %f3, %f0;
    // %f0 = %f2 + %f1 ; %f1 = (%f2 - %f0) + %f1.
    add.rn.f32     %f0, %f2, %f1;
    sub.f32     %f2, %f2, %f0;
    add.rn.f32     %f1, %f2, %f1;
    // Load the result pointer and store %f1 at offset 0 and %f0 at offset 4.
    ld.param.u64     %rl0, [__internal_dsmul_param_0];
    st.f32     [%rl0], %f1;
    st.f32     [%rl0+4], %f0;
    ret;
}

> ptxas -arch=sm_20 test.ptx -o ptx.cubin
ptxas test.ptx, line 13; error   : Alignment must be a power of two
ptxas test.ptx, line 14; error   : Alignment must be a power of two
ptxas fatal   : Ptx assembly aborted due to errors
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20120711/e55c1cd9/attachment.html>