[LLVMbugs] [Bug 527] NEW: multi-dimensional array addressing code not optimized
bugzilla-daemon at cs.uiuc.edu
Mon Feb 21 21:33:43 PST 2005
http://llvm.cs.uiuc.edu/bugs/show_bug.cgi?id=527
Summary: multi-dimensional array addressing code not optimized
Product: libraries
Version: trunk
Platform: All
OS/Version: All
Status: NEW
Severity: normal
Priority: P2
Component: Scalar Optimizations
AssignedTo: unassignedbugs at nondot.org
ReportedBy: jeffc at jolt-lang.org
I noticed that fourinarow is one of the programs on which LLVM is much slower
than GCC, so I decided to take a look and see why that is so. The program has
many loops that look like this:
#define ROWS 6
#define COLS 7
void init_board(char b[COLS][ROWS+1])
{
    int i, j;
    for (i = 0; i < COLS; i++)
        for (j = 0; j < ROWS; j++)
            b[i][j] = '.';
    for (i = 0; i < COLS; i++)
        b[i][ROWS] = 0;
}
This generates the following X86 code:
        .text
        .align  16
        .globl  init_board
        .type   init_board, @function
init_board:
        subl $4, %esp
        movl %esi, (%esp)
        movl 8(%esp), %eax
        movl $0, %ecx
.LBBinit_board_1:                               # loopexit.1
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        movb $46, (%esi)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 1(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 2(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 3(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 4(%esi), %edx
        movb $46, (%edx)
        imull $7, %ecx, %edx
        movl %eax, %esi
        addl %edx, %esi
        leal 5(%esi), %edx
        movb $46, (%edx)
        incl %ecx
        cmpl $7, %ecx
        jne .LBBinit_board_1                    # loopexit.1
.LBBinit_board_2:                               # return
        movb $0, 6(%eax)
        movb $0, 13(%eax)
        movb $0, 20(%eax)
        movb $0, 27(%eax)
        movb $0, 34(%eax)
        movb $0, 41(%eax)
        movb $0, 48(%eax)
        movl (%esp), %esi
        addl $4, %esp
        ret
The code generated by GCC is much faster. LLVM does unroll the loop, which GCC
does not do, but the addressing code is a constant repetition of these three
instructions:
imull $7, %ecx, %edx
movl %eax, %esi
addl %edx, %esi
All but the first occurrence do nothing but put the same address into %esi that
was already there to begin with, and do it with an expensive multiply at that.
GCC correctly creates a suitable induction variable and strength-reduces the
multiplication down to addition.
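For comparison, a hand-strength-reduced version of init_board (hypothetical; this is what an induction-variable pass would effectively produce) carries the row address in a pointer that advances by the row stride, so the per-iteration multiply disappears:

```c
#include <assert.h>

#define ROWS 6
#define COLS 7

/* Strength-reduced sketch: instead of recomputing b + i*(ROWS+1) for
 * every store, keep a pointer to the current row and bump it by the
 * row stride (ROWS+1 bytes) once per outer iteration. */
void init_board_sr(char b[COLS][ROWS + 1])
{
    char *row = &b[0][0];
    int i, j;
    for (i = 0; i < COLS; i++, row += ROWS + 1) {
        for (j = 0; j < ROWS; j++)
            row[j] = '.';
        row[ROWS] = 0;
    }
}
```

The inner loop now addresses memory only through row[j], which the code generator can turn into simple base-plus-displacement stores.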
It turns out that using the new selection dag code generator, the outer loop is
unrolled also and nothing is left but a bunch of movb instructions. Much, much
faster. But still, the optimization passes shouldn't be leaving so much garbage
around for instruction selection to clean up. At first, I thought all that was
needed was another round of basic block optimizations after the loop was
unrolled, until I saw the actual LLVM bytecode:
void %init_board([7 x sbyte]* %b) {
entry:
        br label %loopexit.1

loopexit.1:             ; preds = %loopexit.1, %entry
        %indvar14 = phi uint [ 0, %entry ], [ %indvar.next15, %loopexit.1 ]     ; <uint> [#uses=7]
        %tmp.10 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 0          ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10
        %tmp.10.1 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 1        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.1
        %tmp.10.2 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 2        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.2
        %tmp.10.3 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 3        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.3
        %tmp.10.4 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 4        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.4
        %tmp.10.5 = getelementptr [7 x sbyte]* %b, uint %indvar14, int 5        ; <sbyte*> [#uses=1]
        store sbyte 46, sbyte* %tmp.10.5
        %indvar.next15 = add uint %indvar14, 1          ; <uint> [#uses=2]
        %exitcond16 = seteq uint %indvar.next15, 7      ; <bool> [#uses=1]
        br bool %exitcond16, label %return, label %loopexit.1

return:         ; preds = %loopexit.1
        %tmp.19 = getelementptr [7 x sbyte]* %b, int 0, int 6           ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19
        %tmp.19.1 = getelementptr [7 x sbyte]* %b, int 1, int 6         ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.1
        %tmp.19.2 = getelementptr [7 x sbyte]* %b, int 2, int 6         ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.2
        %tmp.19.3 = getelementptr [7 x sbyte]* %b, int 3, int 6         ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.3
        %tmp.19.4 = getelementptr [7 x sbyte]* %b, int 4, int 6         ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.4
        %tmp.19.5 = getelementptr [7 x sbyte]* %b, int 5, int 6         ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.5
        %tmp.19.6 = getelementptr [7 x sbyte]* %b, int 6, int 6         ; <sbyte*> [#uses=1]
        store sbyte 0, sbyte* %tmp.19.6
        ret void
}
Now the problem is obvious. A two-dimensional array access is being performed
by a single instruction. The arithmetic needed to address the element is
implicit, and therefore inaccessible to optimizations. The redundant
calculations cannot be eliminated, nor can strength reduction be performed.
getelementptr needs to be broken down into its constituent operations at some
point.
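To illustrate what "constituent operations" means here, the two-index getelementptr above is just hidden address arithmetic. A C sketch (function name hypothetical) of the computation that a decomposed form would expose to the optimizer:

```c
#include <assert.h>
#include <stdint.h>

#define ROWS 6
#define COLS 7

/* Explicit form of the arithmetic hidden inside
 *   getelementptr [7 x sbyte]* %b, uint %i, int %j
 * namely: address = base + i * sizeof(row) + j * sizeof(char).
 * Once this multiply and add are visible as separate operations,
 * redundant recomputations can be CSE'd and the i*(ROWS+1) term can
 * be strength-reduced into an induction variable. */
char *gep_2d(char (*b)[ROWS + 1], unsigned i, int j)
{
    uintptr_t base = (uintptr_t)b;
    uintptr_t offset = (uintptr_t)i * (ROWS + 1) + (uintptr_t)j;
    return (char *)(base + offset);
}
```

With the arithmetic explicit, the six getelementptrs in the loop body all share the same i * (ROWS+1) subexpression, which is exactly what the current IR keeps the optimizer from seeing.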
When I increased COLS to the point where the loop could no longer be unrolled,
the selection dag code generator produced effectively the same code as the
default X86 code generator: lots of redundant imul/movl/addl sequences. It
can't clean them up either. Only unrolling all nested loops allows the
addressing arithmetic to be optimized away, regardless of code generator.