[LLVMbugs] [Bug 5968] New: Some useless load/stores in a kernel

bugzilla-daemon at cs.uiuc.edu bugzilla-daemon at cs.uiuc.edu
Thu Jan 7 00:54:46 PST 2010


http://llvm.org/bugs/show_bug.cgi?id=5968

           Summary: Some useless load/stores in a kernel
           Product: new-bugs
           Version: 2.6
          Platform: PC
        OS/Version: Windows XP
            Status: NEW
          Keywords: code-quality
          Severity: normal
          Priority: P2
         Component: new bugs
        AssignedTo: unassignedbugs at nondot.org
        ReportedBy: bearophile at mailas.com
                CC: llvmbugs at cs.uiuc.edu


The attachment contains a small C program whose compiled code shows a number of
redundant loads and stores from memory.

On Windows I compile it with the following command (or with the same options using gcc):

llvm-gcc -Wall -O3 -s -S -fomit-frame-pointer -Wl,--enable-stdcall-fixup -msse3
-march=native lsolver.c -o lsolver_llvm.s

Here -Wl,--enable-stdcall-fixup works around an LLVM bug.


This is the good asm generated by gcc 4.3.3:

L26:
    fldl    -8(%edx,%eax,8)
    fldl    -8(%ebx,%eax,8)
    faddl   -8(%ecx,%eax,8)
    fmul    %st(4), %st
    fldl    -16(%edx,%eax,8)
    faddl   (%edx,%eax,8)
    fmul    %st(6), %st
    faddp   %st, %st(1)
    fmul    %st(3), %st
    fstl    -8(%edx,%eax,8)
    incl    %eax
    fsubp   %st, %st(1)
    cmpl    %edi, %eax
    fmul    %st(0), %st
    faddp   %st, %st(1)
    jne L26


Again with gcc, but with -mfpmath=sse to force the use of SSE; this version uses
5 loads and 1 store:
L26:
    movsd   -8(%edx,%eax,8), %xmm0
    movsd   -8(%ebx,%eax,8), %xmm1
    movsd   -16(%edx,%eax,8), %xmm2
    addsd   -8(%ecx,%eax,8), %xmm1
    addsd   (%edx,%eax,8), %xmm2
    mulsd   %xmm5, %xmm1
    mulsd   %xmm6, %xmm2
    addsd   %xmm2, %xmm1
    mulsd   %xmm4, %xmm1
    movsd   %xmm1, -8(%edx,%eax,8)
    subsd   %xmm0, %xmm1
    incl    %eax
    mulsd   %xmm1, %xmm1
    cmpl    %edi, %eax
    addsd   %xmm1, %xmm3
    jne L26


Compiled with llvm-gcc 2.6 (32-bit), the loop uses 11 loads and 1 store (very similar
asm is produced by the D compiler LDC too):

LBB5_4:
    movl    8(%eax,%ecx,4), %ebx
    movl    (%eax,%ecx,4), %ebp
    movsd   8(%ebp,%esi,8), %xmm4
    addsd   8(%ebx,%esi,8), %xmm4
    mulsd   %xmm1, %xmm4
    movsd   (%edx,%esi,8), %xmm5
    addsd   16(%edx,%esi,8), %xmm5
    mulsd   %xmm2, %xmm5
    addsd   %xmm4, %xmm5
    mulsd   %xmm3, %xmm5
    movsd   8(%edi,%esi,8), %xmm4 ; ***
    movsd   %xmm5, 8(%edi,%esi,8) ; ***
    movl    4(%eax,%ecx,4), %edx
    movsd   8(%edx,%esi,8), %xmm5 ; ***
    subsd   %xmm4, %xmm5
    mulsd   %xmm5, %xmm5
    addsd   %xmm5, %xmm0
    incl    %esi
    cmpl    4(%esp), %esi
    movl    %edx, %edi
    jne LBB5_4


Compiled with llvm-gcc (2.7 trunk) on x86-64 Linux (asm provided by <baldrick> on IRC),
the situation is a little better: 20 instructions instead of 21, with 8 loads and 1
store:

.LBB5_4:
    movq    (%rcx,%rdi,8), %r11
    movq    16(%rcx,%rdi,8), %r10
    addsd   16(%r8,%r9,8), %xmm4
    movsd   8(%r8,%r9,8), %xmm6
    mulsd   %xmm1, %xmm4
    movsd   8(%r11,%r9,8), %xmm5
    addsd   8(%r10,%r9,8), %xmm5
    mulsd   %xmm2, %xmm5
    addsd   %xmm4, %xmm5
    mulsd   %xmm3, %xmm5
    movsd   %xmm5, 8(%r8,%r9,8)
    movq    8(%rcx,%rdi,8), %r8
    movsd   8(%r8,%r9,8), %xmm4
    incq    %r9
    cmpq    %rsi, %r9
    movapd  %xmm4, %xmm5
    subsd   %xmm6, %xmm5
    mulsd   %xmm5, %xmm5
    addsd   %xmm5, %xmm0
    jne .LBB5_4


The effect of those redundant loads and stores (in the 32-bit version) can also be seen
by comparing the running time of the llvm-gcc version with that of the gcc version.


-- 
Configure bugmail: http://llvm.org/bugs/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
You are on the CC list for the bug.



More information about the llvm-bugs mailing list