[LLVMbugs] [Bug 16723] New: Increment only one index to scan and operate on arrays in parallel

Sun Jul 28 07:40:50 PDT 2013

http://llvm.org/bugs/show_bug.cgi?id=16723

            Bug ID: 16723
           Summary: Increment only one index to scan and operate on arrays
                    in parallel
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows XP
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Common Code Generator Code
          Assignee: unassignedbugs at nondot.org
          Reporter: bearophile at mailas.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

This is a potential enhancement request for the back-end.

This is D code, to be compiled with the ldc2 compiler (in D, uint is a 32-bit
unsigned integer):

// Element-wise sum of five arrays: for every index i in 0 .. result.length,
// stores a[i] + b[i] + c[i] + d[i] + e[i] into result[i].
// The iteration range comes from result.length only; the function contains no
// explicit check that the inputs are at least that long.
void foo(in uint[] a,
         in uint[] b,
         in uint[] c,
         in uint[] d,
         in uint[] e,
         uint[] result) pure nothrow {
    // A single index i is shared by all six arrays.
    foreach (immutable i; 0 .. result.length)
        result[i] = cast(uint)(a[i] + b[i] + c[i] + d[i] + e[i]);
}
// Empty entry point; foo is never called from it.
void main() {}

(Compiled with ldc2 v.0.11.0 based on DMD v2.062 and LLVM 3.3svn)

ldmd2 -O -release -inline -noboundscheck -vectorize-loops -vectorize-slp
-vectorize-slp-aggressive -output-s

For the loop of foo ldc2 gives:
LBB0_2:
    movl    %eax, (%esp)
    movl    (%ebx), %eax
    addl    (%ebp), %eax
    addl    (%edi), %eax
    addl    (%esi), %eax
    addl    (%edx), %eax
    movl    %eax, (%ecx)
    movl    (%esp), %eax
    addl    $4, %edx
    addl    $4, %esi
    addl    $4, %edi
    addl    $4, %ebx
    addl    $4, %ebp
    addl    $4, %ecx
    decl    %eax
    jne LBB0_2

This is a similar C99 program (D arrays are not just pointers, they are small
structs holding a pointer and a length, but the loop shouldn't be influenced by this):

#include "stdio.h"
#include "stdint.h"

/*
 * Element-wise sum of five arrays: for every i in [0, n), stores
 * a[i] + b[i] + c[i] + d[i] + e[i] into result[i].
 *
 * n      - number of elements to process in each array
 * a..e   - input arrays, read-only; restrict-qualified
 * result - output array; restrict-qualified
 */
void foo(size_t n,
         const uint32_t *restrict a,
         const uint32_t *restrict b,
         const uint32_t *restrict c,
         const uint32_t *restrict d,
         const uint32_t *restrict e,
               uint32_t *restrict result) {
    /* A single index i is shared by all six arrays. */
    for (size_t i = 0; i < n; i++)
        result[i] = a[i] + b[i] + c[i] + d[i] + e[i];
}
/* Entry point that does nothing but return 0; foo is never called from it. */
int main() {
    return 0;
}

Compiled with:
gcc -std=c99 -Ofast -S

gcc v.4.8.0 gives this for the loop of foo:
L5:
    movl    40(%esp), %ecx
    movl    (%edi,%edx,4), %eax
    addl    0(%ebp,%edx,4), %eax
    addl    (%esi,%edx,4), %eax
    addl    (%ebx,%edx,4), %eax
    addl    (%ecx,%edx,4), %eax
    movl    44(%esp), %ecx
    movl    %eax, (%ecx,%edx,4)
    addl    $1, %edx
    cmpl    20(%esp), %edx
    jne L5

While for the loop of foo, Intel compiler V.13.0.1 (without restrict) gives:
...
..B2.4:                         # Preds ..B2.4 ..B2.3
        movl      (%rsi,%r10,8), %r11d                          #12.21
        addl      (%rdx,%r10,8), %r11d                          #12.28
        movl      (%r8,%r10,8), %ebp                            #12.42
        addl      (%rcx,%r10,8), %r11d                          #12.35
        addl      (%r9,%r10,8), %ebp                            #12.49
        addl      %ebp, %r11d                                   #12.42
        movl      %r11d, (%rbx,%r10,8)                          #12.9
        movl      4(%rsi,%r10,8), %r11d                         #12.21
        addl      4(%rdx,%r10,8), %r11d                         #12.28
        movl      4(%r8,%r10,8), %ebp                           #12.42
        addl      4(%rcx,%r10,8), %r11d                         #12.35
        addl      4(%r9,%r10,8), %ebp                           #12.49
        addl      %ebp, %r11d                                   #12.42
        movl      %r11d, 4(%rbx,%r10,8)                         #12.9
        incq      %r10                                          #11.5
        cmpq      %rax, %r10                                    #11.5
        jb        ..B2.4        # Prob 63%                      #11.5
...

If I use only the a,b,c,d,result arrays, with the Intel compiler (without
restrict), part of foo becomes:
...
..B3.25:                        # Preds ..B3.23 ..B3.25
        movdqu    (%rsi,%rax,4), %xmm2                          #19.28
        movdqu    (%rdx,%rax,4), %xmm0                          #19.28
        lddqu     (%rcx,%rax,4), %xmm1                          #19.35
        paddd     %xmm0, %xmm2                                  #19.28
        paddd     %xmm1, %xmm2                                  #19.35
        movdqa    %xmm2, (%r8,%rax,4)                           #19.9
        addq      $4, %rax                                      #18.5
        cmpq      %r9, %rax                                     #18.5
        jb        ..B3.25       # Prob 82%                      #18.5
...

So, in the end, with this enhancement request I suggest exploring the feasibility
and performance impact of replacing increment instructions like:
    addl    $4, %edx
    addl    $4, %esi
    addl    $4, %edi
    addl    $4, %ebx
    addl    $4, %ebp
    addl    $4, %ecx

with a single counter increment, and then using indexed-addressing instructions
like this to read and write the arrays' memory:
    movl    (%edi,%edx,4), %eax

(Another potential improvement is to use instructions like paddd, which sum four
uints at a time, and to add a little extra serial code after the loop for the
cases where the array lengths are not a multiple of 4. But this is an unrelated
and more complex enhancement.)

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20130728/688a382c/attachment.html>