[LLVMbugs] [Bug 16723] New: Increment only one index to scan and operate on arrays in parallel

bugzilla-daemon at llvm.org bugzilla-daemon at llvm.org
Sun Jul 28 07:40:50 PDT 2013


            Bug ID: 16723
           Summary: Increment only one index to scan and operate on arrays
                    in parallel
           Product: libraries
           Version: trunk
          Hardware: PC
                OS: Windows XP
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: Common Code Generator Code
          Assignee: unassignedbugs at nondot.org
          Reporter: bearophile at mailas.com
                CC: llvmbugs at cs.uiuc.edu
    Classification: Unclassified

This is a potential enhancement request for the back-end.

This is D code, to be compiled with the ldc2 compiler (in D uint is a 32 bit
unsigned integer):

void foo(in uint[] a,
         in uint[] b,
         in uint[] c,
         in uint[] d,
         in uint[] e,
         uint[] result) pure nothrow {
    foreach (immutable i; 0 .. result.length)
        result[i] = cast(uint)(a[i] + b[i] + c[i] + d[i] + e[i]);
void main() {}

(Compiled with ldc2 v.0.11.0 based on DMD v2.062 and LLVM 3.3svn)

ldmd2 -O -release -inline -noboundscheck -vectorize-loops -vectorize-slp
-vectorize-slp-aggressive -output-s

For the loop of foo ldc2 gives:
    movl    %eax, (%esp)
    movl    (%ebx), %eax
    addl    (%ebp), %eax
    addl    (%edi), %eax
    addl    (%esi), %eax
    addl    (%edx), %eax
    movl    %eax, (%ecx)
    movl    (%esp), %eax
    addl    $4, %edx
    addl    $4, %esi
    addl    $4, %edi
    addl    $4, %ebx
    addl    $4, %ebp
    addl    $4, %ecx
    decl    %eax
    jne LBB0_2

This is a similar C99 program (D arrays are not just pointers, they are small
structs of ponter-length, but the loop shouldn't be influenced by this):

#include "stdio.h"
#include "stdint.h"

void foo(size_t n,
         const uint32_t *restrict a,
         const uint32_t *restrict b,
         const uint32_t *restrict c,
         const uint32_t *restrict d,
         const uint32_t *restrict e,
               uint32_t *restrict result) {
    for (size_t i = 0; i < n; i++)
        result[i] = a[i] + b[i] + c[i] + d[i] + e[i];
int main() {
    return 0;

Compiled with:
gcc -std=c99 -Ofast -S

gcc v.4.8.0 gives this for the loop of foo:
    movl    40(%esp), %ecx
    movl    (%edi,%edx,4), %eax
    addl    0(%ebp,%edx,4), %eax
    addl    (%esi,%edx,4), %eax
    addl    (%ebx,%edx,4), %eax
    addl    (%ecx,%edx,4), %eax
    movl    44(%esp), %ecx
    movl    %eax, (%ecx,%edx,4)
    addl    $1, %edx
    cmpl    20(%esp), %edx
    jne L5

While for the loop of foo, Intel compiler V.13.0.1 (without restrict) gives:
..B2.4:                         # Preds ..B2.4 ..B2.3
        movl      (%rsi,%r10,8), %r11d                          #12.21
        addl      (%rdx,%r10,8), %r11d                          #12.28
        movl      (%r8,%r10,8), %ebp                            #12.42
        addl      (%rcx,%r10,8), %r11d                          #12.35
        addl      (%r9,%r10,8), %ebp                            #12.49
        addl      %ebp, %r11d                                   #12.42
        movl      %r11d, (%rbx,%r10,8)                          #12.9
        movl      4(%rsi,%r10,8), %r11d                         #12.21
        addl      4(%rdx,%r10,8), %r11d                         #12.28
        movl      4(%r8,%r10,8), %ebp                           #12.42
        addl      4(%rcx,%r10,8), %r11d                         #12.35
        addl      4(%r9,%r10,8), %ebp                           #12.49
        addl      %ebp, %r11d                                   #12.42
        movl      %r11d, 4(%rbx,%r10,8)                         #12.9
        incq      %r10                                          #11.5
        cmpq      %rax, %r10                                    #11.5
        jb        ..B2.4        # Prob 63%                      #11.5

If I use only the a,b,c,d,result arrays, with the Intel compiler (without
restrict), part of foo becomes:
..B3.25:                        # Preds ..B3.23 ..B3.25
        movdqu    (%rsi,%rax,4), %xmm2                          #19.28
        movdqu    (%rdx,%rax,4), %xmm0                          #19.28
        lddqu     (%rcx,%rax,4), %xmm1                          #19.35
        paddd     %xmm0, %xmm2                                  #19.28
        paddd     %xmm1, %xmm2                                  #19.35
        movdqa    %xmm2, (%r8,%rax,4)                           #19.9
        addq      $4, %rax                                      #18.5
        cmpq      %r9, %rax                                     #18.5
        jb        ..B3.25       # Prob 82%                      #18.5

So in the end in this enhancement I suggest to explore the possibility and
performance of replacing increment instructions like:
    addl    $4, %edx
    addl    $4, %esi
    addl    $4, %edi
    addl    $4, %ebx
    addl    $4, %ebp
    addl    $4, %ecx

with a single counter increment, and then use instructions like this to read
and write arrays memory:
    movl    (%edi,%edx,4), %eax

(Another potential improvement is to use instructions like paddd that sum four
uint at a time and add after the loop a little extra serial code for situations
where the input array lengths are not multiple of 4. But this is an unrelated
and more complex enhancement).

You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20130728/688a382c/attachment.html>

More information about the llvm-bugs mailing list