<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html>

<head>

  <meta content="text/html;charset=ISO-8859-1" http-equiv="Content-Type">

</head>

<body bgcolor="#ffffff" text="#000000">

I noticed that fourinarow is one of the programs in which LLVM is much

slower than GCC, so I decided to take a look and see why that is so. 

The program has many loops that look like this:<br>

<blockquote><tt>#define ROWS 6<br>

#define COLS 7<br>

  <br>

void init_board(char b[COLS][ROWS+1])<br>

{<br>

  int i,j;<br>

  <br>

  for (i=0;i<COLS;i++)<br>

        for (j=0;j<ROWS;j++)<br>

          b[i][j]='.';<br>

                  <br>

  for (i=0;i<COLS;i++)<br>

        b[i][ROWS]=0;<br>

}</tt><br>

</blockquote>

This generates the following X86 code:<tt><br>

</tt>

<blockquote><tt>    .text</tt><br>

  <tt>    .align    16</tt><br>

  <tt>    .globl    init_board</tt><br>

  <tt>    .type    init_board, @function</tt><br>

  <tt>init_board:</tt><br>

  <tt>    subl $4, %esp</tt><br>

  <tt>    movl %esi, (%esp)</tt><br>

  <tt>    movl 8(%esp), %eax</tt><br>

  <tt>    movl $0, %ecx</tt><br>

  <tt>.LBBinit_board_1:    # loopexit.1</tt><br>

  <tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

  <tt>    movb $46, (%esi)</tt><br>

  <tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

  <tt>    leal 1(%esi), %edx</tt><br>

  <tt>    movb $46, (%edx)</tt><br>

  <tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

  <tt>    leal 2(%esi), %edx</tt><br>

  <tt>    movb $46, (%edx)</tt><br>

  <tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

  <tt>    leal 3(%esi), %edx</tt><br>

  <tt>    movb $46, (%edx)</tt><br>

  <tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

  <tt>    leal 4(%esi), %edx</tt><br>

  <tt>    movb $46, (%edx)</tt><br>

  <tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

  <tt>    leal 5(%esi), %edx</tt><br>

  <tt>    movb $46, (%edx)</tt><br>

  <tt>    incl %ecx</tt><br>

  <tt>    cmpl $7, %ecx</tt><br>

  <tt>    jne .LBBinit_board_1    # loopexit.1</tt><br>

  <tt>.LBBinit_board_2:    # return</tt><br>

  <tt>    movb $0, 6(%eax)</tt><br>

  <tt>    movb $0, 13(%eax)</tt><br>

  <tt>    movb $0, 20(%eax)</tt><br>

  <tt>    movb $0, 27(%eax)</tt><br>

  <tt>    movb $0, 34(%eax)</tt><br>

  <tt>    movb $0, 41(%eax)</tt><br>

  <tt>    movb $0, 48(%eax)</tt><br>

  <tt>    movl (%esp), %esi</tt><br>

  <tt>    addl $4, %esp</tt><br>

  <tt>    ret</tt><br>

</blockquote>

The code generated by GCC is much faster.  LLVM does unroll the loop,

which GCC does not do, but the addressing code is a constant repetition

of these three instructions:<br>

<blockquote><tt>    imull $7, %ecx, %edx</tt><br>

  <tt>    movl %eax, %esi</tt><br>

  <tt>    addl %edx, %esi</tt><br>

</blockquote>

All but the first occurrance do nothing but put the same address into

%esi that was already there to being with, and does it with an

expensive multiply at that.  GCC correctly creates a suitable induction

variable and strength reduces the multiplication down to addition.<br>

<br>

It turns out that using the new selection dag code generator, the outer

loop is unrolled also and nothing is left but a bunch of movb

instructions.  Much, much faster.  But still, the optimization passes

shouldn't be leaving so much garbage around for instruction selection

to clean up.  At first, I thought all that was needed was to another

round of basic block optimizations after the loop was unrolled, until I

saw the actual LLVM bytecodes:<br>

<blockquote><tt>void %init_board([7 x sbyte]* %b) {</tt><br>

  <tt>entry:</tt><br>

  <tt>    br label %loopexit.1</tt><br>

  <br>

  <tt>loopexit.1:        ; preds = %loopexit.1, %entry</tt><br>

  <tt>    %indvar14 = phi uint [ 0, %entry ], [ %indvar.next15,

%loopexit.1 ]        ; <uint> [#uses=7]</tt><br>

  <tt>    %tmp.10 = getelementptr [7 x sbyte]* %b, uint %indvar14, int

0        ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 46, sbyte* %tmp.10</tt><br>

  <tt>    %tmp.10.1 = getelementptr [7 x sbyte]* %b, uint %indvar14,

int 1        ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 46, sbyte* %tmp.10.1</tt><br>

  <tt>    %tmp.10.2 = getelementptr [7 x sbyte]* %b, uint %indvar14,

int 2        ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 46, sbyte* %tmp.10.2</tt><br>

  <tt>    %tmp.10.3 = getelementptr [7 x sbyte]* %b, uint %indvar14,

int 3        ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 46, sbyte* %tmp.10.3</tt><br>

  <tt>    %tmp.10.4 = getelementptr [7 x sbyte]* %b, uint %indvar14,

int 4        ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 46, sbyte* %tmp.10.4</tt><br>

  <tt>    %tmp.10.5 = getelementptr [7 x sbyte]* %b, uint %indvar14,

int 5        ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 46, sbyte* %tmp.10.5</tt><br>

  <tt>    %indvar.next15 = add uint %indvar14, 1        ; <uint>

[#uses=2]</tt><br>

  <tt>    %exitcond16 = seteq uint %indvar.next15, 7        ;

<bool> [#uses=1]</tt><br>

  <tt>    br bool %exitcond16, label %return, label %loopexit.1</tt><br>

  <br>

  <tt>return:        ; preds = %loopexit.1</tt><br>

  <tt>    %tmp.19 = getelementptr [7 x sbyte]* %b, int 0, int 6       

; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19</tt><br>

  <tt>    %tmp.19.1 = getelementptr [7 x sbyte]* %b, int 1, int 6   

    ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19.1</tt><br>

  <tt>    %tmp.19.2 = getelementptr [7 x sbyte]* %b, int 2, int 6   

    ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19.2</tt><br>

  <tt>    %tmp.19.3 = getelementptr [7 x sbyte]* %b, int 3, int 6   

    ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19.3</tt><br>

  <tt>    %tmp.19.4 = getelementptr [7 x sbyte]* %b, int 4, int 6   

    ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19.4</tt><br>

  <tt>    %tmp.19.5 = getelementptr [7 x sbyte]* %b, int 5, int 6   

    ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19.5</tt><br>

  <tt>    %tmp.19.6 = getelementptr [7 x sbyte]* %b, int 6, int 6   

    ; <sbyte*> [#uses=1]</tt><br>

  <tt>    store sbyte 0, sbyte* %tmp.19.6</tt><br>

  <tt>    ret void</tt><br>

  <tt>}</tt><br>

</blockquote>

Now the problem is obvious.  A two dimensional array access is being

performed by a single instruction.  The arithmetic needed to address

the element is implicit, and therefore inaccessible to optimizations. 

The redundant calculations can not be eliminated, nor can strength

reduction be performed.  getelementptr needs to be broken down into its

constituent operations at some point.<br>

<br>

Or am I missing something?  Is this breaking down and reapplying

optimizations part of the selection dag process?  I am somewhat

surprised that using it affected what loops got unrolled.<br>

<br>

</body>

</html>