[LLVMbugs] [Bug 8125] New: switches over a 2-bit domain lowered inefficiently
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Fri Sep 10 06:37:16 PDT 2010
http://llvm.org/bugs/show_bug.cgi?id=8125
Summary: switches over a 2-bit domain lowered inefficiently
Product: libraries
Version: trunk
Platform: All
OS/Version: All
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
AssignedTo: unassignedbugs at nondot.org
ReportedBy: ggreif at gmail.com
CC: llvmbugs at cs.uiuc.edu
Consider this C++ program:
##########################################
struct Foo {
void* tagged;
Foo* follow(void);
Foo* collect(unsigned = 1);
};
Foo* Foo::follow(void) {
int t = (unsigned long)tagged & 0x3;
switch (t)
{
case 0:
case 1:
return (this + 1)->follow();
case 3:
return this + 1;
case 2:
return (this + 1)->collect();
}
}
Foo* Foo::collect(unsigned acc) {
int t = (unsigned long)tagged & 0x3;
switch (t)
{
case 0:
case 1:
return (this + 1)->collect((acc << 1) | t);
case 3:
return this + 1;
case 2:
return this + 1 + acc;
}
}
##########################################
Clang compiles the second function to:
##########################################
define %struct.Foo* @_ZN3Foo7collectEj(%struct.Foo* %this, i32 %acc) nounwind
readonly align 2 {
entry:
br label %tailrecurse
tailrecurse: ; preds = %sw.bb, %entry
%indvar = phi i64 [ %indvar.next, %sw.bb ], [ 0, %entry ]
%acc.tr = phi i32 [ %or, %sw.bb ], [ %acc, %entry ]
%tmp = getelementptr inbounds %struct.Foo* %this, i64 %indvar, i32 0
%tmp2 = load i8** %tmp, align 8
%0 = ptrtoint i8* %tmp2 to i64
%and = and i64 %0, 3
%conv = trunc i64 %and to i32
switch i32 %conv, label %sw.epilog [
i32 0, label %sw.bb
i32 1, label %sw.bb
i32 3, label %sw.bb6
i32 2, label %sw.bb8
]
sw.bb: ; preds = %tailrecurse,
%tailrecurse
%shl = shl i32 %acc.tr, 1
%or = or i32 %conv, %shl
%indvar.next = add i64 %indvar, 1
br label %tailrecurse
sw.bb6: ; preds = %tailrecurse
%this.tr.sum21 = add i64 %indvar, 1
%add.ptr7 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum21
ret %struct.Foo* %add.ptr7
sw.bb8: ; preds = %tailrecurse
%idx.ext = zext i32 %acc.tr to i64
%add.ptr9.sum = add i64 %idx.ext, 1
%this.tr.sum = add i64 %indvar, %add.ptr9.sum
%add.ptr11 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum
ret %struct.Foo* %add.ptr11
sw.epilog: ; preds = %tailrecurse
ret %struct.Foo* undef
}
##########################################
With a pretty switch statement inside.
As an aside, I get clang warnings:
Release+Asserts/bin/clang++ -O2 switch.cpp -o switch.O2.ll -emit-llvm -S
-fno-exceptions
switch.cpp:21:1: warning: control may reach end of non-void function
[-Wreturn-type]
}
^
switch.cpp:35:1: warning: control may reach end of non-void function
[-Wreturn-type]
}
^
2 warnings generated.
These warnings stem from clang's inability to see that the AND mask has only 2
bits, which admits 4 combinations, all of which are covered by the switch
statement. The default switch arm (returning undef) also arises from this.
Meta-question: can the switch be written as "switch i32 %conv, undefined
[...]"?
Okay, now let's look at the generated assembly (x86-64) of the first function:
##########################################
_ZN3Foo6followEv:
.Leh_func_begin0:
pushq %rbp
.Ltmp0:
movq %rsp, %rbp
.Ltmp1:
movl $2, %ecx
movq %rdi, %rdx
leaq 8(%rdi), %rax
leaq 16(%rdi), %rsi
jmp .LBB0_1
.align 16, 0x90
.LBB0_9:
addq $8, %rax
incq %rcx
addq $8, %rsi
.LBB0_1:
movl -16(%rsi), %edi
andl $3, %edi
;; we should jz here
cmpl $2, %edi
jb .LBB0_9
cmpl $3, %edi
;; no need for this compare
je .LBB0_10
cmpl $2, %edi
;; no need for this compare
jne .LBB0_13
movl $1, %eax
.align 16, 0x90
.LBB0_5:
movl -8(%rsi), %edi
andl $3, %edi
;; we should jz here
cmpl $3, %edi
;; comparing with 2 would allow "trisection"
je .LBB0_11
cmpl $2, %edi
;; no need for this compare
je .LBB0_12
cmpl $1, %edi
;; no need for this compare
ja .LBB0_13
addl %eax, %eax
orl %edi, %eax
incq %rcx
addq $8, %rsi
jmp .LBB0_5
.LBB0_10:
popq %rbp
ret
.LBB0_11:
movq %rsi, %rax
popq %rbp
ret
.LBB0_12:
movl %eax, %eax
addq %rcx, %rax
leaq (%rdx,%rax,8), %rax
.LBB0_13:
popq %rbp
ret
.Ltmp2:
.size _ZN3Foo6followEv, .Ltmp2-_ZN3Foo6followEv
.Leh_func_end0:
##########################################
I have commented stuff that looks very inefficient. I can imagine a target
independent lowering of a 2-bit value domain (the AND-mask having popcnt=2 with
the zero-value peeled off immediately after the AND) like this:
Enumerate the 3 possible values in order:
x < y < z
compare against y, if eq ---> leg for y
if smaller ---> leg for x
if bigger ---> leg for z
Look 'ma, only one compare!
When the 2 bits of the mask are adjacent, then on targets which have an rcr
(rotate through carry) instruction one could rotate the upper bit into the carry
flag, and the next bit would end up in the sign flag. This would allow a 4-way branch.
On other targets (PPC) we could move to condition code register and have a
multi-way branch too.
Links: http://docs.sun.com/app/docs/doc/802-1948/6i5uqa9p5?l=en&a=view
--
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.
More information about the llvm-bugs
mailing list