[LLVMbugs] [Bug 22246] New: "select i1" not lowered to the optimal form

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Thu Jan 15 18:28:12 PST 2015


http://llvm.org/bugs/show_bug.cgi?id=22246

            Bug ID: 22246
           Summary: "select i1" not lowered to the optimal form
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: All
            Status: NEW
          Severity: normal
          Priority: P
         Component: Backend: PTX
          Assignee: unassignedbugs at nondot.org
          Reporter: wujingyue at gmail.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

Here's a reduced test case. 

CUDA:
__attribute__((global)) void foo(bool p1, bool p2, bool p3, bool *output) {
  *output = (p1 ? p2 : p3);
}

clang++ -cc1 -x cuda -std=gnu++11 -fcuda-is-device -mstackrealign
-D__GCUDACC__=1 -DGCUDACC_STANDALONE_MODE -triple nvptx64-unknown-unknown
-target-cpu sm_35 -emit-llvm ~/Work/cuda/select.cu -o select.ll -fno-builtin
-O3

LLVM IR:
define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8*
nocapture %output) #0 {
entry:
  %.sink.v = select i1 %p1, i1 %p2, i1 %p3
  %frombool5 = zext i1 %.sink.v to i8
  store i8 %frombool5, i8* %output, align 1, !tbaa !2
  ret void
}

llc ~/Work/cuda/select.ll -o ~/Work/cuda/select.ptx -march=nvptx64 -mcpu=sm_35
-fp-contract=fast -O3

PTX:
.visible .entry _Z3foobbbPb(                                                    
        .param .u8 _Z3foobbbPb_param_0,                                         
        .param .u8 _Z3foobbbPb_param_1,                                         
        .param .u8 _Z3foobbbPb_param_2,                                         
        .param .u64 _Z3foobbbPb_param_3                                         
)                                                                               
{                                                                               
        .reg .pred      %p<8>;                                                  
        .reg .s16       %rs<8>;                                                 
        .reg .s64       %rd<2>;                                                 

// BB#0:                                // %entry                               
        ld.param.u8     %rs1, [_Z3foobbbPb_param_0];                            
        and.b16         %rs2, %rs1, 1;                                          
        setp.eq.b16     %p1, %rs2, 1;                                           
        ld.param.u8     %rs3, [_Z3foobbbPb_param_1];                            
        and.b16         %rs4, %rs3, 1;                                          
        setp.eq.b16     %p2, %rs4, 1;                                           
        and.pred        %p3, %p1, %p2;                                          
        not.pred        %p4, %p1;                                               
        ld.param.u8     %rs5, [_Z3foobbbPb_param_2];                            
        and.b16         %rs6, %rs5, 1;                                          
        setp.eq.b16     %p5, %rs6, 1;                                           
        and.pred        %p6, %p4, %p5;                                          
        or.pred         %p7, %p3, %p6;                                          
        selp.u16        %rs7, 1, 0, %p7;                                        
        ld.param.u64    %rd1, [_Z3foobbbPb_param_3];                            
        st.u8   [%rd1], %rs7;                                                   
        ret;                                                                    
}

Essentially, the generated PTX simulates (p1 ? p2 : p3) with (p1 && p2) ||
(!p1 && p3) instead of using the selp instruction. The better PTX generated by
nvcc is:

.visible .entry _Z3foobbbPb(                                                    
        .param .u8 _Z3foobbbPb_param_0,                                         
        .param .u8 _Z3foobbbPb_param_1,                                         
        .param .u8 _Z3foobbbPb_param_2,                                         
        .param .u64 _Z3foobbbPb_param_3                                         
)                                                                               
{                                                                               
        .reg .pred      %p<2>;                                                  
        .reg .s16       %rs<6>;                                                 
        .reg .s64       %rd<3>;                                                 


        ld.param.u64    %rd1, [_Z3foobbbPb_param_3];                            
        cvta.to.global.u64      %rd2, %rd1;                                     
        ld.param.s8     %rs1, [_Z3foobbbPb_param_0];                            
        and.b16         %rs2, %rs1, 255;                                        
        .loc 1 2 1                                                              
        setp.eq.s16     %p1, %rs2, 0;                                           
        ld.param.s8     %rs3, [_Z3foobbbPb_param_1];                            
        ld.param.s8     %rs4, [_Z3foobbbPb_param_2];                            
        .loc 1 2 1                                                              
        selp.b16        %rs5, %rs4, %rs3, %p1;                                  
        st.global.u8    [%rd2], %rs5;                                           
        .loc 1 3 2                                                              
        ret;                                                                    
}

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20150116/e33ae3c1/attachment.html>


More information about the llvm-bugs mailing list