[LLVMdev] Area for improvement

Mon Feb 21 18:48:00 PST 2005

I noticed that fourinarow is one of the programs in which LLVM is much 
slower than GCC, so I decided to take a look and see why that is so.  
The program has many loops that look like this:

    #define ROWS 6
    #define COLS 7

    void init_board(char b[COLS][ROWS+1])
    {
      int i,j;

      for (i=0;i<COLS;i++)
            for (j=0;j<ROWS;j++)
              b[i][j]='.';

      for (i=0;i<COLS;i++)
            b[i][ROWS]=0;
    }

This generates the following X86 code:

        .text
        .align    16
        .globl    init_board
        .type    init_board, @function
    init_board:
        subl $4, %esp
        movl %esi, (%esp)
        movl 8(%esp), %eax
        movl $0, %ecx
    .LBBinit_board_1:    # loopexit.1
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        movb $46, (%esi)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 1(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 2(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 3(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 4(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 5(%esi), %edx
        movb $46, (%edx)
        incl %ecx
        cmpl $7, %ecx
        jne .LBBinit_board_1    # loopexit.1
    .LBBinit_board_2:    # return
        movb $0, 6(%eax)
        movb $0, 13(%eax)
        movb $0, 20(%eax)
        movb $0, 27(%eax)
        movb $0, 34(%eax)
        movb $0, 41(%eax)
        movb $0, 48(%eax)
        movl (%esp), %esi
        addl $4, %esp
        ret

The code generated by GCC is much faster.  LLVM does unroll the loop, 
which GCC does not do, but the addressing code is a constant repetition 
of these three instructions:

        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi

All but the first occurrance do nothing but put the same address into 
%esi that was already there to being with, and does it with an expensive 
multiply at that.  GCC correctly creates a suitable induction variable 
and strength reduces the multiplication down to addition.

It turns out that using the new selection dag code generator, the outer 
loop is unrolled also and nothing is left but a bunch of movb 
instructions.  Much, much faster.  But still, the optimization passes 
shouldn't be leaving so much garbage around for instruction selection to 
clean up.  At first, I thought all that was needed was to another round 
of basic block optimizations after the loop was unrolled, until I saw 
the actual LLVM bytecodes:

    void %init_board([7 x sbyte]* %b) {
    entry:
        br label %loopexit.1

    loopexit.1:        ; preds = %loopexit.1, %entry
        %indvar14 = phi uint [ 0, %entry ], [ %indvar.next15,
    %loopexit.1 ]        ; <uint> [#uses=7]
        %tmp.10 = getelementptr [7 x sbyte]* %b, uint %indvar14, int
    0        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10
        %tmp.10.1 = getelementptr [7 x sbyte]* %b, uint %indvar14, int
    1        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.1
        %tmp.10.2 = getelementptr [7 x sbyte]* %b, uint %indvar14, int
    2        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.2
        %tmp.10.3 = getelementptr [7 x sbyte]* %b, uint %indvar14, int
    3        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.3
        %tmp.10.4 = getelementptr [7 x sbyte]* %b, uint %indvar14, int
    4        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.4
        %tmp.10.5 = getelementptr [7 x sbyte]* %b, uint %indvar14, int
    5        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.5
        %indvar.next15 = add uint %indvar14, 1        ; <uint> [#uses=2]
        %exitcond16 = seteq uint %indvar.next15, 7        ; <bool> [#uses=1]
        br bool %exitcond16, label %return, label %loopexit.1

    return:        ; preds = %loopexit.1
        %tmp.19 = getelementptr [7 x sbyte]* %b, int 0, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19
        %tmp.19.1 = getelementptr [7 x sbyte]* %b, int 1, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.1
        %tmp.19.2 = getelementptr [7 x sbyte]* %b, int 2, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.2
        %tmp.19.3 = getelementptr [7 x sbyte]* %b, int 3, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.3
        %tmp.19.4 = getelementptr [7 x sbyte]* %b, int 4, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.4
        %tmp.19.5 = getelementptr [7 x sbyte]* %b, int 5, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.5
        %tmp.19.6 = getelementptr [7 x sbyte]* %b, int 6, int 6        ;
    <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.6
        ret void
    }

Now the problem is obvious.  A two dimensional array access is being 
performed by a single instruction.  The arithmetic needed to address the 
element is implicit, and therefore inaccessible to optimizations.  The 
redundant calculations can not be eliminated, nor can strength reduction 
be performed.  getelementptr needs to be broken down into its 
constituent operations at some point.

Or am I missing something?  Is this breaking down and reapplying 
optimizations part of the selection dag process?  I am somewhat 
surprised that using it affected what loops got unrolled.

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20050221/3409ef8f/attachment.html>