<html>
<head>
<base href="http://llvm.org/bugs/" />
</head>
<body><table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Bug ID</th>
<td><a class="bz_bug_link
bz_status_NEW "
title="NEW --- - &quot;select i1&quot; not lowered to the optimal form"
href="http://llvm.org/bugs/show_bug.cgi?id=22246">22246</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>"select i1" not lowered to the optimal form
</td>
</tr>
<tr>
<th>Product</th>
<td>libraries
</td>
</tr>
<tr>
<th>Version</th>
<td>trunk
</td>
</tr>
<tr>
<th>Hardware</th>
<td>PC
</td>
</tr>
<tr>
<th>OS</th>
<td>All
</td>
</tr>
<tr>
<th>Status</th>
<td>NEW
</td>
</tr>
<tr>
<th>Severity</th>
<td>normal
</td>
</tr>
<tr>
<th>Priority</th>
<td>P
</td>
</tr>
<tr>
<th>Component</th>
<td>Backend: PTX
</td>
</tr>
<tr>
<th>Assignee</th>
<td>unassignedbugs@nondot.org
</td>
</tr>
<tr>
<th>Reporter</th>
<td>wujingyue@gmail.com
</td>
</tr>
<tr>
<th>CC</th>
<td>llvmbugs@cs.uiuc.edu
</td>
</tr>
<tr>
<th>Classification</th>
<td>Unclassified
</td>
</tr></table>
<p>
<div>
<pre>Here's a reduced test case.
CUDA:
__attribute__((global)) void foo(bool p1, bool p2, bool p3, bool *output) {
*output = (p1 ? p2 : p3);
}
clang++ -cc1 -x cuda -std=gnu++11 -fcuda-is-device -mstackrealign
-D__GCUDACC__=1 -DGCUDACC_STANDALONE_MODE -triple nvptx64-unknown-unknown
-target-cpu sm_35 -emit-llvm ~/Work/cuda/select.cu -o select.ll -fno-builtin
-O3
LLVM IR:
define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8*
nocapture %output) #0 {
entry:
%.sink.v = select i1 %p1, i1 %p2, i1 %p3
%frombool5 = zext i1 %.sink.v to i8
store i8 %frombool5, i8* %output, align 1, !tbaa !2
ret void
}
llc ~/Work/cuda/select.ll -o ~/Work/cuda/select.ptx -march=nvptx64 -mcpu=sm_35
-fp-contract=fast -O3
PTX:
.visible .entry _Z3foobbbPb(
.param .u8 _Z3foobbbPb_param_0,
.param .u8 _Z3foobbbPb_param_1,
.param .u8 _Z3foobbbPb_param_2,
.param .u64 _Z3foobbbPb_param_3
)
{
.reg .pred %p&lt;8&gt;;
.reg .s16 %rs&lt;8&gt;;
.reg .s64 %rd&lt;2&gt;;
// BB#0: // %entry
ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
and.b16 %rs2, %rs1, 1;
setp.eq.b16 %p1, %rs2, 1;
ld.param.u8 %rs3, [_Z3foobbbPb_param_1];
and.b16 %rs4, %rs3, 1;
setp.eq.b16 %p2, %rs4, 1;
and.pred %p3, %p1, %p2;
not.pred %p4, %p1;
ld.param.u8 %rs5, [_Z3foobbbPb_param_2];
and.b16 %rs6, %rs5, 1;
setp.eq.b16 %p5, %rs6, 1;
and.pred %p6, %p4, %p5;
or.pred %p7, %p3, %p6;
selp.u16 %rs7, 1, 0, %p7;
ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
st.u8 [%rd1], %rs7;
ret;
}
Essentially, the PTX simulates (p1 ? p2 : p3) with (p1 && p2) ||
(!p1 && p3) instead of using the selp instruction. A better PTX generated by
nvcc is:
.visible .entry _Z3foobbbPb(
.param .u8 _Z3foobbbPb_param_0,
.param .u8 _Z3foobbbPb_param_1,
.param .u8 _Z3foobbbPb_param_2,
.param .u64 _Z3foobbbPb_param_3
)
{
.reg .pred %p&lt;2&gt;;
.reg .s16 %rs&lt;6&gt;;
.reg .s64 %rd&lt;3&gt;;
ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
cvta.to.global.u64 %rd2, %rd1;
ld.param.s8 %rs1, [_Z3foobbbPb_param_0];
and.b16 %rs2, %rs1, 255;
.loc 1 2 1
setp.eq.s16 %p1, %rs2, 0;
ld.param.s8 %rs3, [_Z3foobbbPb_param_1];
ld.param.s8 %rs4, [_Z3foobbbPb_param_2];
.loc 1 2 1
selp.b16 %rs5, %rs4, %rs3, %p1;
st.global.u8 [%rd2], %rs5;
.loc 1 3 2
ret;
}</pre>
</div>
</p>
<hr>
<span>You are receiving this mail because:</span>
<ul>
<li>You are on the CC list for the bug.</li>
</ul>
</body>
</html>