[LLVMbugs] [Bug 22246] New: "select i1" not lowered to the optimal form
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Thu Jan 15 18:28:12 PST 2015
http://llvm.org/bugs/show_bug.cgi?id=22246
Bug ID: 22246
Summary: "select i1" not lowered to the optimal form
Product: libraries
Version: trunk
Hardware: PC
OS: All
Status: NEW
Severity: normal
Priority: P
Component: Backend: PTX
Assignee: unassignedbugs at nondot.org
Reporter: wujingyue at gmail.com
CC: llvmbugs at cs.uiuc.edu
Classification: Unclassified
Here's a reduced test case.
CUDA:
__attribute__((global)) void foo(bool p1, bool p2, bool p3, bool *output) {
*output = (p1 ? p2 : p3);
}
clang++ -cc1 -x cuda -std=gnu++11 -fcuda-is-device -mstackrealign
-D__GCUDACC__=1 -DGCUDACC_STANDALONE_MODE -triple nvptx64-unknown-unknown
-target-cpu sm_35 -emit-llvm ~/Work/cuda/select.cu -o select.ll -fno-builtin
-O3
LLVM IR:
define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, i8*
nocapture %output) #0 {
entry:
%.sink.v = select i1 %p1, i1 %p2, i1 %p3
%frombool5 = zext i1 %.sink.v to i8
store i8 %frombool5, i8* %output, align 1, !tbaa !2
ret void
}
llc ~/Work/cuda/select.ll -o ~/Work/cuda/select.ptx -march=nvptx64 -mcpu=sm_35
-fp-contract=fast -O3
PTX:
.visible .entry _Z3foobbbPb(
.param .u8 _Z3foobbbPb_param_0,
.param .u8 _Z3foobbbPb_param_1,
.param .u8 _Z3foobbbPb_param_2,
.param .u64 _Z3foobbbPb_param_3
)
{
.reg .pred %p<8>;
.reg .s16 %rs<8>;
.reg .s64 %rd<2>;
// BB#0: // %entry
ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
and.b16 %rs2, %rs1, 1;
setp.eq.b16 %p1, %rs2, 1;
ld.param.u8 %rs3, [_Z3foobbbPb_param_1];
and.b16 %rs4, %rs3, 1;
setp.eq.b16 %p2, %rs4, 1;
and.pred %p3, %p1, %p2;
not.pred %p4, %p1;
ld.param.u8 %rs5, [_Z3foobbbPb_param_2];
and.b16 %rs6, %rs5, 1;
setp.eq.b16 %p5, %rs6, 1;
and.pred %p6, %p4, %p5;
or.pred %p7, %p3, %p6;
selp.u16 %rs7, 1, 0, %p7;
ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
st.u8 [%rd1], %rs7;
ret;
}
Essentially, the generated PTX simulates (p1 ? p2 : p3) with (p1 && p2) ||
(!p1 && p3) instead of using the selp instruction. The better PTX generated by
nvcc is:
.visible .entry _Z3foobbbPb(
.param .u8 _Z3foobbbPb_param_0,
.param .u8 _Z3foobbbPb_param_1,
.param .u8 _Z3foobbbPb_param_2,
.param .u64 _Z3foobbbPb_param_3
)
{
.reg .pred %p<2>;
.reg .s16 %rs<6>;
.reg .s64 %rd<3>;
ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
cvta.to.global.u64 %rd2, %rd1;
ld.param.s8 %rs1, [_Z3foobbbPb_param_0];
and.b16 %rs2, %rs1, 255;
.loc 1 2 1
setp.eq.s16 %p1, %rs2, 0;
ld.param.s8 %rs3, [_Z3foobbbPb_param_1];
ld.param.s8 %rs4, [_Z3foobbbPb_param_2];
.loc 1 2 1
selp.b16 %rs5, %rs4, %rs3, %p1;
st.global.u8 [%rd2], %rs5;
.loc 1 3 2
ret;
}
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20150116/e33ae3c1/attachment.html>
More information about the llvm-bugs
mailing list