[LLVMbugs] [Bug 8125] New: switches over a 2-bit domain lowered inefficiently
bugzilla-daemon at llvm.org
bugzilla-daemon at llvm.org
Fri Sep 10 06:37:16 PDT 2010
http://llvm.org/bugs/show_bug.cgi?id=8125
Summary: switches over a 2-bit domain lowered inefficiently
Product: libraries
Version: trunk
Platform: All
OS/Version: All
Status: NEW
Severity: normal
Priority: P
Component: Backend: X86
AssignedTo: unassignedbugs at nondot.org
ReportedBy: ggreif at gmail.com
CC: llvmbugs at cs.uiuc.edu
Consider this C++ program:
##########################################
struct Foo {
void* tagged;
Foo* follow(void);
Foo* collect(unsigned = 1);
};
Foo* Foo::follow(void) {
int t = (unsigned long)tagged & 0x3;
switch (t)
{
case 0:
case 1:
return (this + 1)->follow();
case 3:
return this + 1;
case 2:
return (this + 1)->collect();
}
}
Foo* Foo::collect(unsigned acc) {
int t = (unsigned long)tagged & 0x3;
switch (t)
{
case 0:
case 1:
return (this + 1)->collect((acc << 1) | t);
case 3:
return this + 1;
case 2:
return this + 1 + acc;
}
}
##########################################
Clang compiles the second function to:
##########################################
define %struct.Foo* @_ZN3Foo7collectEj(%struct.Foo* %this, i32 %acc) nounwind
readonly align 2 {
entry:
br label %tailrecurse
tailrecurse: ; preds = %sw.bb, %entry
%indvar = phi i64 [ %indvar.next, %sw.bb ], [ 0, %entry ]
%acc.tr = phi i32 [ %or, %sw.bb ], [ %acc, %entry ]
%tmp = getelementptr inbounds %struct.Foo* %this, i64 %indvar, i32 0
%tmp2 = load i8** %tmp, align 8
%0 = ptrtoint i8* %tmp2 to i64
%and = and i64 %0, 3
%conv = trunc i64 %and to i32
switch i32 %conv, label %sw.epilog [
i32 0, label %sw.bb
i32 1, label %sw.bb
i32 3, label %sw.bb6
i32 2, label %sw.bb8
]
sw.bb: ; preds = %tailrecurse,
%tailrecurse
%shl = shl i32 %acc.tr, 1
%or = or i32 %conv, %shl
%indvar.next = add i64 %indvar, 1
br label %tailrecurse
sw.bb6: ; preds = %tailrecurse
%this.tr.sum21 = add i64 %indvar, 1
%add.ptr7 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum21
ret %struct.Foo* %add.ptr7
sw.bb8: ; preds = %tailrecurse
%idx.ext = zext i32 %acc.tr to i64
%add.ptr9.sum = add i64 %idx.ext, 1
%this.tr.sum = add i64 %indvar, %add.ptr9.sum
%add.ptr11 = getelementptr inbounds %struct.Foo* %this, i64 %this.tr.sum
ret %struct.Foo* %add.ptr11
sw.epilog: ; preds = %tailrecurse
ret %struct.Foo* undef
}
##########################################
With a pretty switch statement inside.
As an aside, I get clang warnings:
Release+Asserts/bin/clang++ -O2 switch.cpp -o switch.O2.ll -emit-llvm -S
-fno-exceptions
switch.cpp:21:1: warning: control may reach end of non-void function
[-Wreturn-type]
}
^
switch.cpp:35:1: warning: control may reach end of non-void function
[-Wreturn-type]
}
^
2 warnings generated.
These warnings stem from clang's inability to see that the AND mask has only 2
bits, which admits 4 combinations, all of which are covered by the switch
statement. The default switch arm (returning undef) also arises from this.
Meta-question: can the switch be written as "switch i32 %conv, undefined
[...]"?
Okay, now let's look at the generated assembly (x86-64) of the first function:
##########################################
_ZN3Foo6followEv:
.Leh_func_begin0:
pushq %rbp
.Ltmp0:
movq %rsp, %rbp
.Ltmp1:
movl $2, %ecx
movq %rdi, %rdx
leaq 8(%rdi), %rax
leaq 16(%rdi), %rsi
jmp .LBB0_1
.align 16, 0x90
.LBB0_9:
addq $8, %rax
incq %rcx
addq $8, %rsi
.LBB0_1:
movl -16(%rsi), %edi
andl $3, %edi
;; we should jz here
cmpl $2, %edi
jb .LBB0_9
cmpl $3, %edi
;; no need for this compare
je .LBB0_10
cmpl $2, %edi
;; no need for this compare
jne .LBB0_13
movl $1, %eax
.align 16, 0x90
.LBB0_5:
movl -8(%rsi), %edi
andl $3, %edi
;; we should jz here
cmpl $3, %edi
;; comparing with 2 would allow "trisection"
je .LBB0_11
cmpl $2, %edi
;; no need for this compare
je .LBB0_12
cmpl $1, %edi
;; no need for this compare
ja .LBB0_13
addl %eax, %eax
orl %edi, %eax
incq %rcx
addq $8, %rsi
jmp .LBB0_5
.LBB0_10:
popq %rbp
ret
.LBB0_11:
movq %rsi, %rax
popq %rbp
ret
.LBB0_12:
movl %eax, %eax
addq %rcx, %rax
leaq (%rdx,%rax,8), %rax
.LBB0_13:
popq %rbp
ret
.Ltmp2:
.size _ZN3Foo6followEv, .Ltmp2-_ZN3Foo6followEv
.Leh_func_end0:
##########################################
I have commented stuff that looks very inefficient. I can imagine a target
independent lowering of a 2-bit value domain (the AND-mask having popcnt=2 with
the zero-value peeled off immediately after the AND) like this:
Enumerate the 3 possible values in order:
x < y < z
compare against y, if eq ---> leg for y
if smaller ---> leg for x
if bigger ---> leg for z
Look 'ma, only one compare!
When the 2 bits of the mask are adjacent, then on targets which have an rcr
(rotate through carry) instruction one could rotate the upper bit into the carry
flag, and the next bit would end up in the sign flag. This would allow a 4-way branch.
On other targets (PPC) we could move to condition code register and have a
multi-way branch too.
Links: http://docs.sun.com/app/docs/doc/802-1948/6i5uqa9p5?l=en&a=view
--
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.
More information about the llvm-bugs
mailing list