<html>

    <head>

      <base href="http://llvm.org/bugs/" />

    </head>

    <body><table border="1" cellspacing="0" cellpadding="8">

        <tr>

          <th>Bug ID</th>

          <td><a class="bz_bug_link 

          bz_status_NEW "

   title="NEW --- - Increment only one index to scan and operate on arrays in parallel"

   href="http://llvm.org/bugs/show_bug.cgi?id=16723">16723</a>

          </td>

        </tr>

        <tr>

          <th>Summary</th>

          <td>Increment only one index to scan and operate on arrays in parallel

          </td>

        </tr>

        <tr>

          <th>Product</th>

          <td>libraries

          </td>

        </tr>

        <tr>

          <th>Version</th>

          <td>trunk

          </td>

        </tr>

        <tr>

          <th>Hardware</th>

          <td>PC

          </td>

        </tr>

        <tr>

          <th>OS</th>

          <td>Windows XP

          </td>

        </tr>

        <tr>

          <th>Status</th>

          <td>NEW

          </td>

        </tr>

        <tr>

          <th>Severity</th>

          <td>enhancement

          </td>

        </tr>

        <tr>

          <th>Priority</th>

          <td>P

          </td>

        </tr>

        <tr>

          <th>Component</th>

          <td>Common Code Generator Code

          </td>

        </tr>

        <tr>

          <th>Assignee</th>

          <td>unassignedbugs@nondot.org

          </td>

        </tr>

        <tr>

          <th>Reporter</th>

          <td>bearophile@mailas.com

          </td>

        </tr>

        <tr>

          <th>CC</th>

          <td>llvmbugs@cs.uiuc.edu

          </td>

        </tr>

        <tr>

          <th>Classification</th>

          <td>Unclassified

          </td>

        </tr></table>

      <p>

        <div>

        <pre>This is a potential enhancement request for the back-end.

This is D code, to be compiled with the ldc2 compiler (in D uint is a 32 bit

unsigned integer):

void foo(in uint[] a,

         in uint[] b,

         in uint[] c,

         in uint[] d,

         in uint[] e,

         uint[] result) pure nothrow {

    foreach (immutable i; 0 .. result.length)

        result[i] = cast(uint)(a[i] + b[i] + c[i] + d[i] + e[i]);

}

void main() {}

(Compiled with ldc2 v.0.11.0 based on DMD v2.062 and LLVM 3.3svn)

ldmd2 -O -release -inline -noboundscheck -vectorize-loops -vectorize-slp

-vectorize-slp-aggressive -output-s

For the loop of foo ldc2 gives:

LBB0_2:

    movl    %eax, (%esp)

    movl    (%ebx), %eax

    addl    (%ebp), %eax

    addl    (%edi), %eax

    addl    (%esi), %eax

    addl    (%edx), %eax

    movl    %eax, (%ecx)

    movl    (%esp), %eax

    addl    $4, %edx

    addl    $4, %esi

    addl    $4, %edi

    addl    $4, %ebx

    addl    $4, %ebp

    addl    $4, %ecx

    decl    %eax

    jne LBB0_2

This is a similar C99 program (D arrays are not just pointers, they are small

structs of ponter-length, but the loop shouldn't be influenced by this):

#include "stdio.h"

#include "stdint.h"

void foo(size_t n,

         const uint32_t *restrict a,

         const uint32_t *restrict b,

         const uint32_t *restrict c,

         const uint32_t *restrict d,

         const uint32_t *restrict e,

               uint32_t *restrict result) {

    for (size_t i = 0; i < n; i++)

        result[i] = a[i] + b[i] + c[i] + d[i] + e[i];

}

int main() {

    return 0;

}

Compiled with:

gcc -std=c99 -Ofast -S

gcc v.4.8.0 gives this for the loop of foo:

L5:

    movl    40(%esp), %ecx

    movl    (%edi,%edx,4), %eax

    addl    0(%ebp,%edx,4), %eax

    addl    (%esi,%edx,4), %eax

    addl    (%ebx,%edx,4), %eax

    addl    (%ecx,%edx,4), %eax

    movl    44(%esp), %ecx

    movl    %eax, (%ecx,%edx,4)

    addl    $1, %edx

    cmpl    20(%esp), %edx

    jne L5

While for the loop of foo, Intel compiler V.13.0.1 (without restrict) gives:

...

..B2.4:                         # Preds ..B2.4 ..B2.3

        movl      (%rsi,%r10,8), %r11d                          #12.21

        addl      (%rdx,%r10,8), %r11d                          #12.28

        movl      (%r8,%r10,8), %ebp                            #12.42

        addl      (%rcx,%r10,8), %r11d                          #12.35

        addl      (%r9,%r10,8), %ebp                            #12.49

        addl      %ebp, %r11d                                   #12.42

        movl      %r11d, (%rbx,%r10,8)                          #12.9

        movl      4(%rsi,%r10,8), %r11d                         #12.21

        addl      4(%rdx,%r10,8), %r11d                         #12.28

        movl      4(%r8,%r10,8), %ebp                           #12.42

        addl      4(%rcx,%r10,8), %r11d                         #12.35

        addl      4(%r9,%r10,8), %ebp                           #12.49

        addl      %ebp, %r11d                                   #12.42

        movl      %r11d, 4(%rbx,%r10,8)                         #12.9

        incq      %r10                                          #11.5

        cmpq      %rax, %r10                                    #11.5

        jb        ..B2.4        # Prob 63%                      #11.5

...

If I use only the a,b,c,d,result arrays, with the Intel compiler (without

restrict), part of foo becomes:

...

..B3.25:                        # Preds ..B3.23 ..B3.25

        movdqu    (%rsi,%rax,4), %xmm2                          #19.28

        movdqu    (%rdx,%rax,4), %xmm0                          #19.28

        lddqu     (%rcx,%rax,4), %xmm1                          #19.35

        paddd     %xmm0, %xmm2                                  #19.28

        paddd     %xmm1, %xmm2                                  #19.35

        movdqa    %xmm2, (%r8,%rax,4)                           #19.9

        addq      $4, %rax                                      #18.5

        cmpq      %r9, %rax                                     #18.5

        jb        ..B3.25       # Prob 82%                      #18.5

...

So in the end in this enhancement I suggest to explore the possibility and

performance of replacing increment instructions like:

    addl    $4, %edx

    addl    $4, %esi

    addl    $4, %edi

    addl    $4, %ebx

    addl    $4, %ebp

    addl    $4, %ecx

with a single counter increment, and then use instructions like this to read

and write arrays memory:

    movl    (%edi,%edx,4), %eax

(Another potential improvement is to use instructions like paddd that sum four

uint at a time and add after the loop a little extra serial code for situations

where the input array lengths are not multiple of 4. But this is an unrelated

and more complex enhancement).</pre>

        </div>

      </p>

      <hr>

      <span>You are receiving this mail because:</span>

      <ul>

          <li>You are on the CC list for the bug.</li>

      </ul>

    </body>

</html>