[llvm-dev] [RFC] New pass: LoopExitValues

Tue Sep 1 11:06:53 PDT 2015

On Mon, Aug 31, 2015 at 5:52 PM, Jake VanAdrighem
<jvanadrighem at gmail.com> wrote:
> Do you have some specific performance measurements?

Averaging 4 runs of 10000 iterations each of Coremark on my X86_64
desktop showed:

-O2 performance: +2.9% faster with the L.E.V. pass
-Os size: 1.5% smaller with the L.E.V. pass

In the case of Coremark, the benefit comes mainly from the matrix
portion of the benchmark, which uses nested loops.  Similarly, I used
a matrix multiplication for the regression test as shown below.  The
L.E.V. pass eliminated the 4 instructions marked in the second listing.

/*
 * Regression-test kernel: a doubly nested loop that writes
 *   Dst[Outer * Size + Inner] = Src[Outer * Size + Inner] * Val
 * for every 0 <= Outer, Inner < Size, i.e. each visited element of Src is
 * multiplied by the scalar Val and stored at the same flat index in Dst.
 *
 * NOTE(review): Outer and Inner are int while Size is unsigned int, so the
 * loop comparisons and the index arithmetic are evaluated as unsigned int
 * after the usual arithmetic conversions.
 */
void matrix_mul(unsigned int Size, unsigned int *Dst, unsigned int
*Src, unsigned int Val) {
  for (int Outer = 0; Outer < Size; ++Outer)
    for (int Inner = 0; Inner < Size; ++Inner)
       /* Same flat index is used for the load and the store. */
       Dst[Outer * Size + Inner] = Src[Outer * Size + Inner] * Val;
}

With LoopExitValues:
-------------------------------
# matrix_mul, AT&T syntax.
# Register roles (arguments follow the C signature in SysV AMD64 order --
# TODO confirm the target ABI; the loads/stores below are consistent with it):
#   %edi  = Size      %rsi = Dst      %rdx = Src      %ecx = Val
#   %r8d  = outer loop counter
#   %r11d = inner loop counter
#   %r9d  = flat element index; zeroed once and never reset, so its value on
#           leaving the inner loop is carried straight into the next outer
#           iteration
#   %r10  = zero-extended copy of %r9d used for addressing
#   %eax  = element being scaled
matrix_mul:
    testl %edi, %edi              # Size == 0 ?
    je .LBB0_5                    #   then there is nothing to do
    xorl %r9d, %r9d               # flat index = 0
    xorl %r8d, %r8d               # outer counter = 0
.LBB0_2:                          # --- outer loop ---
    xorl %r11d, %r11d             # inner counter = 0
.LBB0_3:                          # --- inner loop ---
    movl %r9d, %r10d              # 32-bit move zero-extends the index into %r10
    movl (%rdx,%r10,4), %eax      # eax = Src[index]
    imull %ecx, %eax              # eax *= Val
    movl %eax, (%rsi,%r10,4)      # Dst[index] = eax
    incl %r11d                    # ++inner counter
    incl %r9d                     # ++flat index
    cmpl %r11d, %edi
    jne .LBB0_3                   # loop until inner counter == Size
    incl %r8d                     # ++outer counter
    cmpl %edi, %r8d
    jne .LBB0_2                   # loop until outer counter == Size
.LBB0_5:
    retq

Without LoopExitValues:
-----------------------------------
# matrix_mul, AT&T syntax.
# Register roles (arguments follow the C signature in SysV AMD64 order --
# TODO confirm the target ABI; the loads/stores below are consistent with it):
#   %edi  = Size      %rsi = Dst      %rdx = Src      %ecx = Val
#   %r9d  = outer loop counter
#   %r10d = inner loop counter
#   %r8d  = flat index of the current outer iteration's first element
#           (advanced by Size after each inner loop)
#   %eax  = running flat index, re-seeded from %r8d for every outer iteration
#   %r11  = zero-extended copy of %eax used for addressing
#   %ebx  = element being scaled
matrix_mul:
    pushq %rbx           # Eliminated by L.E.V. pass (save of %rbx, used as the temporary below)
# NOTE(review): the two labels below have no visible users in this listing;
# presumably they anchored directives that were stripped from the output.
.Ltmp0:
.Ltmp1:
    testl %edi, %edi              # Size == 0 ?
    je .LBB0_5                    #   then there is nothing to do
    xorl %r8d, %r8d               # first-element index = 0
    xorl %r9d, %r9d               # outer counter = 0
.LBB0_2:                          # --- outer loop ---
    xorl %r10d, %r10d             # inner counter = 0
    movl %r8d, %eax              # Eliminated by L.E.V. pass (re-seed of the running index)
.LBB0_3:                          # --- inner loop ---
    movl %eax, %r11d              # 32-bit move zero-extends the index into %r11
    movl (%rdx,%r11,4), %ebx      # ebx = Src[index]
    imull %ecx, %ebx              # ebx *= Val
    movl %ebx, (%rsi,%r11,4)      # Dst[index] = ebx
    incl %r10d                    # ++inner counter
    incl %eax                     # ++running index
    cmpl %r10d, %edi
    jne .LBB0_3                   # loop until inner counter == Size
    incl %r9d                     # ++outer counter
    addl %edi, %r8d            # Eliminated by L.E.V. pass (first-element index += Size)
    cmpl %edi, %r9d
    jne .LBB0_2                   # loop until outer counter == Size
.LBB0_5:
    popq %rbx                    # Eliminated by L.E.V. pass (restore of %rbx)
    retq