<html>
    <head>
      <base href="http://llvm.org/bugs/" />
    </head>
    <body><table border="1" cellspacing="0" cellpadding="8">
        <tr>
          <th>Bug ID</th>
          <td><a class="bz_bug_link 
          bz_status_NEW "
   title="NEW --- - &quot;select i1&quot; not lowered to the optimal form"
   href="http://llvm.org/bugs/show_bug.cgi?id=22246">22246</a>
          </td>
        </tr>

        <tr>
          <th>Summary</th>
          <td>"select i1" not lowered to the optimal form
          </td>
        </tr>

        <tr>
          <th>Product</th>
          <td>libraries
          </td>
        </tr>

        <tr>
          <th>Version</th>
          <td>trunk
          </td>
        </tr>

        <tr>
          <th>Hardware</th>
          <td>PC
          </td>
        </tr>

        <tr>
          <th>OS</th>
          <td>All
          </td>
        </tr>

        <tr>
          <th>Status</th>
          <td>NEW
          </td>
        </tr>

        <tr>
          <th>Severity</th>
          <td>normal
          </td>
        </tr>

        <tr>
          <th>Priority</th>
          <td>P
          </td>
        </tr>

        <tr>
          <th>Component</th>
          <td>Backend: PTX
          </td>
        </tr>

        <tr>
          <th>Assignee</th>
          <td>unassignedbugs@nondot.org
          </td>
        </tr>

        <tr>
          <th>Reporter</th>
          <td>wujingyue@gmail.com
          </td>
        </tr>

        <tr>
          <th>CC</th>
          <td>llvmbugs@cs.uiuc.edu
          </td>
        </tr>

        <tr>
          <th>Classification</th>
          <td>Unclassified
          </td>
        </tr></table>
      <p>
        <div>
        <pre>Here's a reduced test case. 

CUDA:
__attribute__((global)) void foo(bool p1, bool p2, bool p3, bool *output) {
  *output = (p1 ? p2 : p3);
}

clang++ -cc1 -x cuda -std=gnu++11 -fcuda-is-device -mstackrealign
-D__GCUDACC__=1 -DGCUDACC_STANDALONE_MODE -triple nvptx64-unknown-unknown
-target-cpu sm_35 -emit-llvm ~/Work/cuda/select.cu -o select.ll -fno-builtin
-O3

LLVM IR:
define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8*
nocapture %output) #0 {
entry:
  %.sink.v = select i1 %p1, i1 %p2, i1 %p3
  %frombool5 = zext i1 %.sink.v to i8
  store i8 %frombool5, i8* %output, align 1, !tbaa !2
  ret void
}

llc ~/Work/cuda/select.ll -o ~/Work/cuda/select.ptx -march=nvptx64 -mcpu=sm_35
-fp-contract=fast -O3

PTX:
.visible .entry _Z3foobbbPb(                                                    
        .param .u8 _Z3foobbbPb_param_0,                                         
        .param .u8 _Z3foobbbPb_param_1,                                         
        .param .u8 _Z3foobbbPb_param_2,                                         
        .param .u64 _Z3foobbbPb_param_3                                         
)                                                                               
{                                                                               
        .reg .pred      %p<8>;                                                  
        .reg .s16       %rs<8>;                                                 
        .reg .s64       %rd<2>;                                                 

// BB#0:                                // %entry                               
        ld.param.u8     %rs1, [_Z3foobbbPb_param_0];                            
        and.b16         %rs2, %rs1, 1;                                          
        setp.eq.b16     %p1, %rs2, 1;                                           
        ld.param.u8     %rs3, [_Z3foobbbPb_param_1];                            
        and.b16         %rs4, %rs3, 1;                                          
        setp.eq.b16     %p2, %rs4, 1;                                           
        and.pred        %p3, %p1, %p2;                                          
        not.pred        %p4, %p1;                                               
        ld.param.u8     %rs5, [_Z3foobbbPb_param_2];                            
        and.b16         %rs6, %rs5, 1;                                          
        setp.eq.b16     %p5, %rs6, 1;                                           
        and.pred        %p6, %p4, %p5;                                          
        or.pred         %p7, %p3, %p6;                                          
        selp.u16        %rs7, 1, 0, %p7;                                        
        ld.param.u64    %rd1, [_Z3foobbbPb_param_3];                            
        st.u8   [%rd1], %rs7;                                                   
        ret;                                                                    
}

Essentially, the PTX simulates (p1 ? p2 : p3) with (p1 && p2) ||
(!p1 && p3) instead of using the selp instruction. A better PTX generated by
nvcc is:

.visible .entry _Z3foobbbPb(                                                    
        .param .u8 _Z3foobbbPb_param_0,                                         
        .param .u8 _Z3foobbbPb_param_1,                                         
        .param .u8 _Z3foobbbPb_param_2,                                         
        .param .u64 _Z3foobbbPb_param_3                                         
)                                                                               
{                                                                               
        .reg .pred      %p<2>;                                                  
        .reg .s16       %rs<6>;                                                 
        .reg .s64       %rd<3>;                                                 


        ld.param.u64    %rd1, [_Z3foobbbPb_param_3];                            
        cvta.to.global.u64      %rd2, %rd1;                                     
        ld.param.s8     %rs1, [_Z3foobbbPb_param_0];                            
        and.b16         %rs2, %rs1, 255;                                        
        .loc 1 2 1                                                              
        setp.eq.s16     %p1, %rs2, 0;                                           
        ld.param.s8     %rs3, [_Z3foobbbPb_param_1];                            
        ld.param.s8     %rs4, [_Z3foobbbPb_param_2];                            
        .loc 1 2 1                                                              
        selp.b16        %rs5, %rs4, %rs3, %p1;                                  
        st.global.u8    [%rd2], %rs5;                                           
        .loc 1 3 2                                                              
        ret;                                                                    
}</pre>
        </div>
      </p>
      <hr>
      <span>You are receiving this mail because:</span>
      
      <ul>
          <li>You are on the CC list for the bug.</li>
      </ul>
    </body>
</html>